diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12721 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 7642, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 3.21875, + "learning_rate": 6.535947712418301e-09, + "logits/chosen": -2.2813315391540527, + "logits/rejected": -2.01680850982666, + "logps/chosen": -216.2415771484375, + "logps/rejected": -121.72990417480469, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 2.78125, + "learning_rate": 6.535947712418302e-08, + "logits/chosen": -2.4226467609405518, + "logits/rejected": -2.36716365814209, + "logps/chosen": -281.0023498535156, + "logps/rejected": -206.8296661376953, + "loss": 0.6932, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.00011049283784814179, + "rewards/margins": -0.00019701628480106592, + "rewards/rejected": 0.00030750909354537725, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 3.265625, + "learning_rate": 1.3071895424836603e-07, + "logits/chosen": -2.4675583839416504, + "logits/rejected": -2.504241466522217, + "logps/chosen": -238.65823364257812, + "logps/rejected": -216.17611694335938, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.00041228701593354344, + "rewards/margins": 8.625028567621484e-05, + "rewards/rejected": 0.00032603665022179484, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 3.453125, + "learning_rate": 1.9607843137254904e-07, + "logits/chosen": -2.4760632514953613, + "logits/rejected": -2.4448161125183105, + "logps/chosen": -226.6892547607422, + "logps/rejected": -225.68603515625, + "loss": 0.6929, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0022668808232992887, + "rewards/margins": 0.0005866141873411834, + "rewards/rejected": 0.0016802664613351226, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 3.203125, + "learning_rate": 2.6143790849673207e-07, + "logits/chosen": -2.5491480827331543, + "logits/rejected": -2.4344499111175537, + "logps/chosen": -260.48663330078125, + "logps/rejected": -297.4029235839844, + "loss": 0.6923, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.007132339291274548, + "rewards/margins": 0.0017693456029519439, + "rewards/rejected": 0.005362994037568569, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 3.5625, + "learning_rate": 3.267973856209151e-07, + "logits/chosen": -2.5332558155059814, + "logits/rejected": -2.395725965499878, + "logps/chosen": -268.88641357421875, + "logps/rejected": -231.83193969726562, + "loss": 0.6921, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.010614831931889057, + "rewards/margins": 0.002073808806017041, + "rewards/rejected": 0.00854102335870266, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 3.328125, + "learning_rate": 3.921568627450981e-07, + "logits/chosen": -2.486255168914795, + "logits/rejected": -2.4650115966796875, + "logps/chosen": -304.43609619140625, + "logps/rejected": -281.531005859375, + "loss": 0.692, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0181833915412426, + "rewards/margins": 0.0023597306571900845, + "rewards/rejected": 0.015823662281036377, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 2.453125, + "learning_rate": 4.5751633986928105e-07, + "logits/chosen": -2.468477249145508, + "logits/rejected": -2.358503818511963, + "logps/chosen": -274.7584228515625, + "logps/rejected": -262.47967529296875, + "loss": 0.6893, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.023093093186616898, + "rewards/margins": 0.00769047299399972, + "rewards/rejected": 0.015402620658278465, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 3.578125, + "learning_rate": 5.228758169934641e-07, + "logits/chosen": -2.4944674968719482, + "logits/rejected": -2.388322353363037, + "logps/chosen": -281.0355529785156, + "logps/rejected": -250.0665283203125, + "loss": 0.6891, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.030989915132522583, + "rewards/margins": 0.00816916860640049, + "rewards/rejected": 0.022820744663476944, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 2.90625, + "learning_rate": 5.882352941176471e-07, + "logits/chosen": -2.54004168510437, + "logits/rejected": -2.4396867752075195, + "logps/chosen": -264.7198181152344, + "logps/rejected": -234.16793823242188, + "loss": 0.69, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.036226507276296616, + "rewards/margins": 0.006393597926944494, + "rewards/rejected": 0.029832908883690834, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 3.421875, + "learning_rate": 6.535947712418302e-07, + "logits/chosen": -2.6216228008270264, + "logits/rejected": -2.5309205055236816, + "logps/chosen": -265.4188537597656, + "logps/rejected": -224.04837036132812, + "loss": 0.6885, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.04085296764969826, + "rewards/margins": 0.009448667988181114, + "rewards/rejected": 0.031404297798871994, + "step": 100 + }, + { + "epoch": 0.01, + "eval_logits/chosen": -2.386389970779419, + "eval_logits/rejected": -2.301281452178955, + "eval_logps/chosen": -260.609619140625, + "eval_logps/rejected": -241.47633361816406, + "eval_loss": 0.6887126564979553, + "eval_rewards/accuracies": 0.6154999732971191, + "eval_rewards/chosen": 0.04011417552828789, + "eval_rewards/margins": 0.009088212624192238, + "eval_rewards/rejected": 0.031025957316160202, + "eval_runtime": 1591.0686, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 2.640625, + "learning_rate": 7.189542483660131e-07, + "logits/chosen": -2.4524688720703125, + "logits/rejected": -2.3316445350646973, + "logps/chosen": -242.79489135742188, + "logps/rejected": -221.9463653564453, + "loss": 0.6879, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.04074569046497345, + "rewards/margins": 0.010795327834784985, + "rewards/rejected": 0.02995036169886589, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 3.65625, + "learning_rate": 7.843137254901962e-07, + "logits/chosen": -2.6706340312957764, + "logits/rejected": -2.5337891578674316, + "logps/chosen": -304.76055908203125, + "logps/rejected": -252.02218627929688, + "loss": 0.6863, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04983269050717354, + "rewards/margins": 0.014050389640033245, + "rewards/rejected": 0.03578229993581772, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 3.875, + "learning_rate": 8.496732026143792e-07, + "logits/chosen": -2.52778959274292, + "logits/rejected": -2.411165714263916, + "logps/chosen": -310.9256286621094, + "logps/rejected": -255.0240478515625, + "loss": 0.6857, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.050248682498931885, + "rewards/margins": 0.01527202595025301, + "rewards/rejected": 0.03497665375471115, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 2.828125, + "learning_rate": 9.150326797385621e-07, + "logits/chosen": -2.5127735137939453, + "logits/rejected": -2.426474094390869, + "logps/chosen": -238.1068878173828, + "logps/rejected": -235.57666015625, + "loss": 0.6877, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.04580515995621681, + "rewards/margins": 0.011373082175850868, + "rewards/rejected": 0.034432075917720795, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 2.875, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": -2.514488697052002, + "logits/rejected": -2.4246912002563477, + "logps/chosen": -253.14688110351562, + "logps/rejected": -230.44039916992188, + "loss": 0.6849, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04800596088171005, + "rewards/margins": 0.0169829074293375, + "rewards/rejected": 0.0310230515897274, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 3.609375, + "learning_rate": 1.0457516339869283e-06, + "logits/chosen": -2.6019513607025146, + "logits/rejected": -2.48175048828125, + "logps/chosen": -250.6589813232422, + "logps/rejected": -221.22116088867188, + "loss": 0.6851, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.05127542465925217, + "rewards/margins": 0.016976820304989815, + "rewards/rejected": 0.0342986062169075, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 2.796875, + "learning_rate": 1.111111111111111e-06, + "logits/chosen": -2.562800407409668, + "logits/rejected": -2.4376182556152344, + "logps/chosen": -264.79949951171875, + "logps/rejected": -236.85531616210938, + "loss": 0.683, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.05695644021034241, + "rewards/margins": 0.02115512825548649, + "rewards/rejected": 0.03580131381750107, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 2.6875, + "learning_rate": 1.1764705882352942e-06, + "logits/chosen": -2.595853567123413, + "logits/rejected": -2.445687770843506, + "logps/chosen": -253.2369842529297, + "logps/rejected": -239.16470336914062, + "loss": 0.6793, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.05581844970583916, + "rewards/margins": 0.029622327536344528, + "rewards/rejected": 0.026196125894784927, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 3.59375, + "learning_rate": 1.2418300653594772e-06, + "logits/chosen": -2.59649395942688, + "logits/rejected": -2.5004138946533203, + "logps/chosen": -260.62255859375, + "logps/rejected": -233.1819610595703, + "loss": 0.6771, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.05944965034723282, + "rewards/margins": 0.03332919254899025, + "rewards/rejected": 0.02612045407295227, + "step": 190 + }, + { + "epoch": 0.03, + "grad_norm": 3.203125, + "learning_rate": 1.3071895424836604e-06, + "logits/chosen": -2.5189108848571777, + "logits/rejected": -2.487696409225464, + "logps/chosen": -228.7367401123047, + "logps/rejected": -243.255859375, + "loss": 0.6826, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04864143207669258, + "rewards/margins": 0.02205492928624153, + "rewards/rejected": 0.02658650279045105, + "step": 200 + }, + { + "epoch": 0.03, + "eval_logits/chosen": -2.379215955734253, + "eval_logits/rejected": -2.2938711643218994, + "eval_logps/chosen": -259.2414855957031, + "eval_logps/rejected": -242.49423217773438, + "eval_loss": 0.6777375936508179, + "eval_rewards/accuracies": 0.6554999947547913, + "eval_rewards/chosen": 0.05379528924822807, + "eval_rewards/margins": 0.03294837102293968, + "eval_rewards/rejected": 0.02084691822528839, + "eval_runtime": 1592.4448, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 200 + }, + { + "epoch": 0.03, + "grad_norm": 3.4375, + "learning_rate": 1.3725490196078434e-06, + "logits/chosen": -2.5041604042053223, + "logits/rejected": -2.4234907627105713, + "logps/chosen": -257.89825439453125, + "logps/rejected": -264.8030090332031, + "loss": 0.6747, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.049550481140613556, + "rewards/margins": 0.038632601499557495, + "rewards/rejected": 0.01091787964105606, + "step": 210 + }, + { + "epoch": 0.03, + "grad_norm": 2.65625, + "learning_rate": 1.4379084967320261e-06, + "logits/chosen": -2.511364698410034, + "logits/rejected": -2.390717029571533, + "logps/chosen": -214.1625213623047, + "logps/rejected": -208.02676391601562, + "loss": 0.6712, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.047859933227300644, + "rewards/margins": 0.046445488929748535, + "rewards/rejected": 0.0014144459273666143, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 3.078125, + "learning_rate": 1.5032679738562091e-06, + "logits/chosen": -2.6636946201324463, + "logits/rejected": -2.510026454925537, + "logps/chosen": -330.6913146972656, + "logps/rejected": -265.2884216308594, + "loss": 0.6669, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.06970520317554474, + "rewards/margins": 0.0569244809448719, + "rewards/rejected": 0.012780720368027687, + "step": 230 + }, + { + "epoch": 0.03, + "grad_norm": 3.875, + "learning_rate": 1.5686274509803923e-06, + "logits/chosen": -2.523871421813965, + "logits/rejected": -2.399880886077881, + "logps/chosen": -249.8741455078125, + "logps/rejected": -195.24508666992188, + "loss": 0.6542, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05200767517089844, + "rewards/margins": 0.08412192761898041, + "rewards/rejected": -0.03211425989866257, + "step": 240 + }, + { + "epoch": 0.03, + "grad_norm": 3.546875, + "learning_rate": 1.6339869281045753e-06, + "logits/chosen": -2.488612174987793, + "logits/rejected": -2.384747266769409, + "logps/chosen": -273.90985107421875, + "logps/rejected": -223.31674194335938, + "loss": 0.6547, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.0524408221244812, + "rewards/margins": 0.08312702178955078, + "rewards/rejected": -0.03068619966506958, + "step": 250 + }, + { + "epoch": 0.03, + "grad_norm": 3.96875, + "learning_rate": 1.6993464052287585e-06, + "logits/chosen": -2.437614679336548, + "logits/rejected": -2.3234875202178955, + "logps/chosen": -271.07061767578125, + "logps/rejected": -242.3330535888672, + "loss": 0.6587, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.012964209541678429, + "rewards/margins": 0.07740563899278641, + "rewards/rejected": -0.06444142013788223, + "step": 260 + }, + { + "epoch": 0.04, + "grad_norm": 3.53125, + "learning_rate": 1.7647058823529414e-06, + "logits/chosen": -2.459193468093872, + "logits/rejected": -2.414381980895996, + "logps/chosen": -232.7344970703125, + "logps/rejected": -245.03836059570312, + "loss": 0.6609, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.025754287838935852, + "rewards/margins": 0.07392101734876633, + "rewards/rejected": -0.09967531263828278, + "step": 270 + }, + { + "epoch": 0.04, + "grad_norm": 3.546875, + "learning_rate": 1.8300653594771242e-06, + "logits/chosen": -2.3779327869415283, + "logits/rejected": -2.3616323471069336, + "logps/chosen": -235.00436401367188, + "logps/rejected": -238.427978515625, + "loss": 0.6686, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.0035261516459286213, + "rewards/margins": 0.06117261201143265, + "rewards/rejected": -0.05764646455645561, + "step": 280 + }, + { + "epoch": 0.04, + "grad_norm": 3.84375, + "learning_rate": 1.8954248366013072e-06, + "logits/chosen": -2.4707655906677246, + "logits/rejected": -2.3777573108673096, + "logps/chosen": -292.0356140136719, + "logps/rejected": -266.07550048828125, + "loss": 0.6614, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.013245267793536186, + "rewards/margins": 0.07698690891265869, + "rewards/rejected": -0.06374163925647736, + "step": 290 + }, + { + "epoch": 0.04, + "grad_norm": 4.34375, + "learning_rate": 1.96078431372549e-06, + "logits/chosen": -2.4952127933502197, + "logits/rejected": -2.3478877544403076, + "logps/chosen": -272.2964172363281, + "logps/rejected": -271.04937744140625, + "loss": 0.6623, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.06571820378303528, + "rewards/margins": 0.07633010298013687, + "rewards/rejected": -0.14204831421375275, + "step": 300 + }, + { + "epoch": 0.04, + "eval_logits/chosen": -2.3202240467071533, + "eval_logits/rejected": -2.2310445308685303, + "eval_logps/chosen": -273.9336853027344, + "eval_logps/rejected": -262.1588439941406, + "eval_loss": 0.6578417420387268, + "eval_rewards/accuracies": 0.6735000014305115, + "eval_rewards/chosen": -0.09312662482261658, + "eval_rewards/margins": 0.08267267793416977, + "eval_rewards/rejected": -0.17579929530620575, + "eval_runtime": 1595.4881, + "eval_samples_per_second": 1.254, + "eval_steps_per_second": 0.313, + "step": 300 + }, + { + "epoch": 0.04, + "grad_norm": 4.8125, + "learning_rate": 2.0261437908496734e-06, + "logits/chosen": -2.4691600799560547, + "logits/rejected": -2.4113526344299316, + "logps/chosen": -294.3885803222656, + "logps/rejected": -286.75103759765625, + "loss": 0.6492, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.05116035416722298, + "rewards/margins": 0.10192099958658218, + "rewards/rejected": -0.15308134257793427, + "step": 310 + }, + { + "epoch": 0.04, + "grad_norm": 4.78125, + "learning_rate": 2.0915032679738565e-06, + "logits/chosen": -2.4071547985076904, + "logits/rejected": -2.2507896423339844, + "logps/chosen": -257.3243713378906, + "logps/rejected": -214.6096954345703, + "loss": 0.6449, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.00719846785068512, + "rewards/margins": 0.11425099521875381, + "rewards/rejected": -0.10705254226922989, + "step": 320 + }, + { + "epoch": 0.04, + "grad_norm": 4.125, + "learning_rate": 2.1568627450980393e-06, + "logits/chosen": -2.530496120452881, + "logits/rejected": -2.3858256340026855, + "logps/chosen": -315.06976318359375, + "logps/rejected": -302.36773681640625, + "loss": 0.639, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.06672513484954834, + "rewards/margins": 0.12611213326454163, + "rewards/rejected": -0.19283726811408997, + "step": 330 + }, + { + "epoch": 0.04, + "grad_norm": 4.84375, + "learning_rate": 2.222222222222222e-06, + "logits/chosen": -2.3970444202423096, + "logits/rejected": -2.2407002449035645, + "logps/chosen": -263.9761657714844, + "logps/rejected": -242.84762573242188, + "loss": 0.6657, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21234241127967834, + "rewards/margins": 0.06845516711473465, + "rewards/rejected": -0.2807976007461548, + "step": 340 + }, + { + "epoch": 0.05, + "grad_norm": 4.125, + "learning_rate": 2.2875816993464053e-06, + "logits/chosen": -2.485644578933716, + "logits/rejected": -2.405118703842163, + "logps/chosen": -264.986328125, + "logps/rejected": -254.9670867919922, + "loss": 0.6408, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06968159228563309, + "rewards/margins": 0.12432824075222015, + "rewards/rejected": -0.19400982558727264, + "step": 350 + }, + { + "epoch": 0.05, + "grad_norm": 3.828125, + "learning_rate": 2.3529411764705885e-06, + "logits/chosen": -2.366410732269287, + "logits/rejected": -2.3023273944854736, + "logps/chosen": -270.1907653808594, + "logps/rejected": -266.2153015136719, + "loss": 0.6524, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.003988529555499554, + "rewards/margins": 0.09547580033540726, + "rewards/rejected": -0.09148726612329483, + "step": 360 + }, + { + "epoch": 0.05, + "grad_norm": 4.15625, + "learning_rate": 2.4183006535947716e-06, + "logits/chosen": -2.369810104370117, + "logits/rejected": -2.2653660774230957, + "logps/chosen": -234.8569793701172, + "logps/rejected": -219.356689453125, + "loss": 0.6548, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.0072977119125425816, + "rewards/margins": 0.0890003889799118, + "rewards/rejected": -0.09629810601472855, + "step": 370 + }, + { + "epoch": 0.05, + "grad_norm": 5.3125, + "learning_rate": 2.4836601307189544e-06, + "logits/chosen": -2.3556571006774902, + "logits/rejected": -2.259316921234131, + "logps/chosen": -281.4800109863281, + "logps/rejected": -286.7106628417969, + "loss": 0.6479, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0648920014500618, + "rewards/margins": 0.11231891065835953, + "rewards/rejected": -0.17721091210842133, + "step": 380 + }, + { + "epoch": 0.05, + "grad_norm": 4.75, + "learning_rate": 2.549019607843137e-06, + "logits/chosen": -2.312218189239502, + "logits/rejected": -2.1933465003967285, + "logps/chosen": -274.30120849609375, + "logps/rejected": -267.49444580078125, + "loss": 0.645, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.11016625165939331, + "rewards/margins": 0.12351493537425995, + "rewards/rejected": -0.23368120193481445, + "step": 390 + }, + { + "epoch": 0.05, + "grad_norm": 9.875, + "learning_rate": 2.6143790849673208e-06, + "logits/chosen": -2.315176248550415, + "logits/rejected": -2.2708230018615723, + "logps/chosen": -243.77029418945312, + "logps/rejected": -249.6044921875, + "loss": 0.6619, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.26674699783325195, + "rewards/margins": 0.08755677193403244, + "rewards/rejected": -0.354303777217865, + "step": 400 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -2.14410400390625, + "eval_logits/rejected": -2.030946969985962, + "eval_logps/chosen": -294.5643615722656, + "eval_logps/rejected": -286.9753723144531, + "eval_loss": 0.6454918384552002, + "eval_rewards/accuracies": 0.6610000133514404, + "eval_rewards/chosen": -0.29943349957466125, + "eval_rewards/margins": 0.12453118711709976, + "eval_rewards/rejected": -0.4239646792411804, + "eval_runtime": 1594.2847, + "eval_samples_per_second": 1.254, + "eval_steps_per_second": 0.314, + "step": 400 + }, + { + "epoch": 0.05, + "grad_norm": 4.03125, + "learning_rate": 2.6797385620915036e-06, + "logits/chosen": -2.3303027153015137, + "logits/rejected": -2.211501359939575, + "logps/chosen": -266.9333190917969, + "logps/rejected": -248.0369873046875, + "loss": 0.6583, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.34348687529563904, + "rewards/margins": 0.08375416696071625, + "rewards/rejected": -0.4272410273551941, + "step": 410 + }, + { + "epoch": 0.05, + "grad_norm": 4.5625, + "learning_rate": 2.7450980392156867e-06, + "logits/chosen": -2.3430421352386475, + "logits/rejected": -2.0668082237243652, + "logps/chosen": -336.4903564453125, + "logps/rejected": -275.1713562011719, + "loss": 0.6576, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.41029343008995056, + "rewards/margins": 0.09131225943565369, + "rewards/rejected": -0.501605749130249, + "step": 420 + }, + { + "epoch": 0.06, + "grad_norm": 4.59375, + "learning_rate": 2.8104575163398695e-06, + "logits/chosen": -2.378570079803467, + "logits/rejected": -2.128157138824463, + "logps/chosen": -343.38433837890625, + "logps/rejected": -285.4502258300781, + "loss": 0.6281, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.38465428352355957, + "rewards/margins": 0.15774312615394592, + "rewards/rejected": -0.5423974394798279, + "step": 430 + }, + { + "epoch": 0.06, + "grad_norm": 4.09375, + "learning_rate": 2.8758169934640523e-06, + "logits/chosen": -2.3335065841674805, + "logits/rejected": -2.3891799449920654, + "logps/chosen": -266.10101318359375, + "logps/rejected": -275.11773681640625, + "loss": 0.6645, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3828270733356476, + "rewards/margins": 0.08971767127513885, + "rewards/rejected": -0.47254472970962524, + "step": 440 + }, + { + "epoch": 0.06, + "grad_norm": 4.65625, + "learning_rate": 2.9411764705882355e-06, + "logits/chosen": -2.360802412033081, + "logits/rejected": -2.125321865081787, + "logps/chosen": -305.62811279296875, + "logps/rejected": -278.4323425292969, + "loss": 0.6356, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3769600987434387, + "rewards/margins": 0.14625975489616394, + "rewards/rejected": -0.523219883441925, + "step": 450 + }, + { + "epoch": 0.06, + "grad_norm": 5.28125, + "learning_rate": 3.0065359477124182e-06, + "logits/chosen": -2.2305850982666016, + "logits/rejected": -2.1131339073181152, + "logps/chosen": -325.5186462402344, + "logps/rejected": -308.9561767578125, + "loss": 0.6336, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.43926286697387695, + "rewards/margins": 0.1619986593723297, + "rewards/rejected": -0.6012614965438843, + "step": 460 + }, + { + "epoch": 0.06, + "grad_norm": 5.28125, + "learning_rate": 3.071895424836602e-06, + "logits/chosen": -2.356358051300049, + "logits/rejected": -2.2044506072998047, + "logps/chosen": -292.234130859375, + "logps/rejected": -312.48565673828125, + "loss": 0.6471, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4053603708744049, + "rewards/margins": 0.1308523416519165, + "rewards/rejected": -0.5362127423286438, + "step": 470 + }, + { + "epoch": 0.06, + "grad_norm": 7.1875, + "learning_rate": 3.1372549019607846e-06, + "logits/chosen": -2.2809040546417236, + "logits/rejected": -2.200786590576172, + "logps/chosen": -255.33657836914062, + "logps/rejected": -276.2983093261719, + "loss": 0.6495, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2522006928920746, + "rewards/margins": 0.13950569927692413, + "rewards/rejected": -0.3917064070701599, + "step": 480 + }, + { + "epoch": 0.06, + "grad_norm": 5.28125, + "learning_rate": 3.2026143790849674e-06, + "logits/chosen": -2.4270236492156982, + "logits/rejected": -2.225928783416748, + "logps/chosen": -251.0017547607422, + "logps/rejected": -223.0763397216797, + "loss": 0.6329, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2193588763475418, + "rewards/margins": 0.17358329892158508, + "rewards/rejected": -0.3929421603679657, + "step": 490 + }, + { + "epoch": 0.07, + "grad_norm": 5.1875, + "learning_rate": 3.2679738562091506e-06, + "logits/chosen": -2.4462802410125732, + "logits/rejected": -2.2391388416290283, + "logps/chosen": -285.2283935546875, + "logps/rejected": -278.61834716796875, + "loss": 0.6257, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.30025702714920044, + "rewards/margins": 0.18700367212295532, + "rewards/rejected": -0.4872606694698334, + "step": 500 + }, + { + "epoch": 0.07, + "eval_logits/chosen": -2.1485366821289062, + "eval_logits/rejected": -2.0400145053863525, + "eval_logps/chosen": -299.84417724609375, + "eval_logps/rejected": -300.69671630859375, + "eval_loss": 0.6193849444389343, + "eval_rewards/accuracies": 0.6850000023841858, + "eval_rewards/chosen": -0.35223132371902466, + "eval_rewards/margins": 0.20894668996334076, + "eval_rewards/rejected": -0.5611779689788818, + "eval_runtime": 1595.3344, + "eval_samples_per_second": 1.254, + "eval_steps_per_second": 0.313, + "step": 500 + }, + { + "epoch": 0.07, + "grad_norm": 6.3125, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": -2.2720694541931152, + "logits/rejected": -2.0909745693206787, + "logps/chosen": -319.68511962890625, + "logps/rejected": -312.24517822265625, + "loss": 0.6034, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3121896982192993, + "rewards/margins": 0.25317278504371643, + "rewards/rejected": -0.5653624534606934, + "step": 510 + }, + { + "epoch": 0.07, + "grad_norm": 5.65625, + "learning_rate": 3.398692810457517e-06, + "logits/chosen": -2.4018216133117676, + "logits/rejected": -2.334257125854492, + "logps/chosen": -304.389404296875, + "logps/rejected": -286.22607421875, + "loss": 0.6273, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.311653196811676, + "rewards/margins": 0.19162270426750183, + "rewards/rejected": -0.503275990486145, + "step": 520 + }, + { + "epoch": 0.07, + "grad_norm": 5.875, + "learning_rate": 3.4640522875816997e-06, + "logits/chosen": -2.320746421813965, + "logits/rejected": -2.172206401824951, + "logps/chosen": -301.77679443359375, + "logps/rejected": -298.7806701660156, + "loss": 0.5768, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4527795910835266, + "rewards/margins": 0.2991865277290344, + "rewards/rejected": -0.751966118812561, + "step": 530 + }, + { + "epoch": 0.07, + "grad_norm": 5.6875, + "learning_rate": 3.529411764705883e-06, + "logits/chosen": -2.257561445236206, + "logits/rejected": -2.160403251647949, + "logps/chosen": -344.9192199707031, + "logps/rejected": -380.3932189941406, + "loss": 0.5383, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.6207869648933411, + "rewards/margins": 0.4684711992740631, + "rewards/rejected": -1.0892581939697266, + "step": 540 + }, + { + "epoch": 0.07, + "grad_norm": 6.0, + "learning_rate": 3.5947712418300657e-06, + "logits/chosen": -2.241232395172119, + "logits/rejected": -1.9481990337371826, + "logps/chosen": -365.34429931640625, + "logps/rejected": -332.47369384765625, + "loss": 0.6163, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5884149670600891, + "rewards/margins": 0.3151562809944153, + "rewards/rejected": -0.9035712480545044, + "step": 550 + }, + { + "epoch": 0.07, + "grad_norm": 5.3125, + "learning_rate": 3.6601307189542484e-06, + "logits/chosen": -2.107689142227173, + "logits/rejected": -2.058168888092041, + "logps/chosen": -286.3569641113281, + "logps/rejected": -329.6611633300781, + "loss": 0.5914, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5777391791343689, + "rewards/margins": 0.3210974335670471, + "rewards/rejected": -0.8988364934921265, + "step": 560 + }, + { + "epoch": 0.07, + "grad_norm": 8.125, + "learning_rate": 3.7254901960784316e-06, + "logits/chosen": -2.1557562351226807, + "logits/rejected": -2.2561655044555664, + "logps/chosen": -304.96063232421875, + "logps/rejected": -351.2149353027344, + "loss": 0.5715, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4714527726173401, + "rewards/margins": 0.3753376305103302, + "rewards/rejected": -0.8467904329299927, + "step": 570 + }, + { + "epoch": 0.08, + "grad_norm": 12.125, + "learning_rate": 3.7908496732026144e-06, + "logits/chosen": -1.9725488424301147, + "logits/rejected": -1.8165016174316406, + "logps/chosen": -353.0700378417969, + "logps/rejected": -351.30902099609375, + "loss": 0.5807, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5593926310539246, + "rewards/margins": 0.3593262732028961, + "rewards/rejected": -0.9187189340591431, + "step": 580 + }, + { + "epoch": 0.08, + "grad_norm": 19.75, + "learning_rate": 3.856209150326798e-06, + "logits/chosen": -1.9927419424057007, + "logits/rejected": -1.7365013360977173, + "logps/chosen": -356.13531494140625, + "logps/rejected": -363.0181579589844, + "loss": 0.6351, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7341691851615906, + "rewards/margins": 0.30337682366371155, + "rewards/rejected": -1.037545919418335, + "step": 590 + }, + { + "epoch": 0.08, + "grad_norm": 5.15625, + "learning_rate": 3.92156862745098e-06, + "logits/chosen": -1.9878699779510498, + "logits/rejected": -1.9630234241485596, + "logps/chosen": -298.11883544921875, + "logps/rejected": -349.0447692871094, + "loss": 0.6114, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5508363842964172, + "rewards/margins": 0.29506638646125793, + "rewards/rejected": -0.8459027409553528, + "step": 600 + }, + { + "epoch": 0.08, + "eval_logits/chosen": -1.719992756843567, + "eval_logits/rejected": -1.550325632095337, + "eval_logps/chosen": -327.6963806152344, + "eval_logps/rejected": -340.6011962890625, + "eval_loss": 0.6003695130348206, + "eval_rewards/accuracies": 0.6754999756813049, + "eval_rewards/chosen": -0.6307531595230103, + "eval_rewards/margins": 0.3294694721698761, + "eval_rewards/rejected": -0.960222601890564, + "eval_runtime": 1594.9578, + "eval_samples_per_second": 1.254, + "eval_steps_per_second": 0.313, + "step": 600 + }, + { + "epoch": 0.08, + "grad_norm": 9.4375, + "learning_rate": 3.986928104575164e-06, + "logits/chosen": -2.0066254138946533, + "logits/rejected": -1.6175451278686523, + "logps/chosen": -334.5193786621094, + "logps/rejected": -315.98345947265625, + "loss": 0.594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7500374913215637, + "rewards/margins": 0.318537175655365, + "rewards/rejected": -1.0685746669769287, + "step": 610 + }, + { + "epoch": 0.08, + "grad_norm": 7.46875, + "learning_rate": 4.052287581699347e-06, + "logits/chosen": -1.4493825435638428, + "logits/rejected": -1.4420878887176514, + "logps/chosen": -362.75543212890625, + "logps/rejected": -412.0641174316406, + "loss": 0.5098, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.9031947255134583, + "rewards/margins": 0.5772475600242615, + "rewards/rejected": -1.4804422855377197, + "step": 620 + }, + { + "epoch": 0.08, + "grad_norm": 13.75, + "learning_rate": 4.11764705882353e-06, + "logits/chosen": -1.7892284393310547, + "logits/rejected": -1.3057540655136108, + "logps/chosen": -373.5083923339844, + "logps/rejected": -370.4551086425781, + "loss": 0.6099, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9319362640380859, + "rewards/margins": 0.3402162194252014, + "rewards/rejected": -1.2721525430679321, + "step": 630 + }, + { + "epoch": 0.08, + "grad_norm": 6.375, + "learning_rate": 4.183006535947713e-06, + "logits/chosen": -1.5417096614837646, + "logits/rejected": -1.0605213642120361, + "logps/chosen": -311.5467529296875, + "logps/rejected": -309.4730224609375, + "loss": 0.6052, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7302576899528503, + "rewards/margins": 0.36248451471328735, + "rewards/rejected": -1.0927422046661377, + "step": 640 + }, + { + "epoch": 0.09, + "grad_norm": 6.75, + "learning_rate": 4.2483660130718954e-06, + "logits/chosen": -1.3266146183013916, + "logits/rejected": -0.8559074401855469, + "logps/chosen": -351.74542236328125, + "logps/rejected": -339.4545593261719, + "loss": 0.5848, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7287541627883911, + "rewards/margins": 0.41469335556030273, + "rewards/rejected": -1.1434475183486938, + "step": 650 + }, + { + "epoch": 0.09, + "grad_norm": 8.5625, + "learning_rate": 4.313725490196079e-06, + "logits/chosen": -1.0535982847213745, + "logits/rejected": -0.5663085579872131, + "logps/chosen": -340.8390197753906, + "logps/rejected": -378.7320556640625, + "loss": 0.5298, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.7466461062431335, + "rewards/margins": 0.5064037442207336, + "rewards/rejected": -1.2530498504638672, + "step": 660 + }, + { + "epoch": 0.09, + "grad_norm": 10.375, + "learning_rate": 4.379084967320262e-06, + "logits/chosen": -0.47771701216697693, + "logits/rejected": -0.5111185312271118, + "logps/chosen": -349.4082946777344, + "logps/rejected": -397.6182556152344, + "loss": 0.5864, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.225978136062622, + "rewards/margins": 0.3780650496482849, + "rewards/rejected": -1.6040430068969727, + "step": 670 + }, + { + "epoch": 0.09, + "grad_norm": 6.84375, + "learning_rate": 4.444444444444444e-06, + "logits/chosen": -0.7972174286842346, + "logits/rejected": -0.40817341208457947, + "logps/chosen": -455.7498474121094, + "logps/rejected": -472.62994384765625, + "loss": 0.5718, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6846425533294678, + "rewards/margins": 0.46953901648521423, + "rewards/rejected": -2.154181480407715, + "step": 680 + }, + { + "epoch": 0.09, + "grad_norm": 9.5625, + "learning_rate": 4.509803921568628e-06, + "logits/chosen": -0.33812543749809265, + "logits/rejected": -0.39494821429252625, + "logps/chosen": -400.83795166015625, + "logps/rejected": -469.98486328125, + "loss": 0.5862, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6113307476043701, + "rewards/margins": 0.44311681389808655, + "rewards/rejected": -2.054447650909424, + "step": 690 + }, + { + "epoch": 0.09, + "grad_norm": 11.125, + "learning_rate": 4.5751633986928105e-06, + "logits/chosen": -1.0038961172103882, + "logits/rejected": -0.4275578558444977, + "logps/chosen": -435.5682678222656, + "logps/rejected": -460.2854919433594, + "loss": 0.5394, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5751301050186157, + "rewards/margins": 0.5817869901657104, + "rewards/rejected": -2.156917095184326, + "step": 700 + }, + { + "epoch": 0.09, + "eval_logits/chosen": -0.9309160709381104, + "eval_logits/rejected": -0.6532166600227356, + "eval_logps/chosen": -421.52081298828125, + "eval_logps/rejected": -443.0096130371094, + "eval_loss": 0.6103234887123108, + "eval_rewards/accuracies": 0.6635000109672546, + "eval_rewards/chosen": -1.5689976215362549, + "eval_rewards/margins": 0.4153095483779907, + "eval_rewards/rejected": -1.9843071699142456, + "eval_runtime": 1595.2265, + "eval_samples_per_second": 1.254, + "eval_steps_per_second": 0.313, + "step": 700 + }, + { + "epoch": 0.09, + "grad_norm": 10.5625, + "learning_rate": 4.640522875816994e-06, + "logits/chosen": -1.3852497339248657, + "logits/rejected": -1.0913816690444946, + "logps/chosen": -403.0024108886719, + "logps/rejected": -451.954833984375, + "loss": 0.4959, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.197796106338501, + "rewards/margins": 0.7116587162017822, + "rewards/rejected": -1.9094547033309937, + "step": 710 + }, + { + "epoch": 0.09, + "grad_norm": 12.125, + "learning_rate": 4.705882352941177e-06, + "logits/chosen": -1.3930566310882568, + "logits/rejected": -0.8486838340759277, + "logps/chosen": -390.4463195800781, + "logps/rejected": -396.0668029785156, + "loss": 0.6395, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0593743324279785, + "rewards/margins": 0.36317670345306396, + "rewards/rejected": -1.4225510358810425, + "step": 720 + }, + { + "epoch": 0.1, + "grad_norm": 10.8125, + "learning_rate": 4.77124183006536e-06, + "logits/chosen": -1.3652091026306152, + "logits/rejected": -0.6764923334121704, + "logps/chosen": -348.77593994140625, + "logps/rejected": -372.30548095703125, + "loss": 0.5272, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5752251744270325, + "rewards/margins": 0.49377313256263733, + "rewards/rejected": -1.0689985752105713, + "step": 730 + }, + { + "epoch": 0.1, + "grad_norm": 10.75, + "learning_rate": 4.836601307189543e-06, + "logits/chosen": -0.3494882583618164, + "logits/rejected": -0.1411532610654831, + "logps/chosen": -328.62261962890625, + "logps/rejected": -353.97772216796875, + "loss": 0.5619, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8632782697677612, + "rewards/margins": 0.5012267827987671, + "rewards/rejected": -1.3645050525665283, + "step": 740 + }, + { + "epoch": 0.1, + "grad_norm": 9.0, + "learning_rate": 4.901960784313726e-06, + "logits/chosen": 0.3033140301704407, + "logits/rejected": 0.7218297719955444, + "logps/chosen": -423.762451171875, + "logps/rejected": -459.82977294921875, + "loss": 0.5727, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9222949743270874, + "rewards/margins": 0.4701816141605377, + "rewards/rejected": -2.3924765586853027, + "step": 750 + }, + { + "epoch": 0.1, + "grad_norm": 9.25, + "learning_rate": 4.967320261437909e-06, + "logits/chosen": -0.20161783695220947, + "logits/rejected": 0.31453028321266174, + "logps/chosen": -453.60162353515625, + "logps/rejected": -451.3170471191406, + "loss": 0.6744, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.9592533111572266, + "rewards/margins": 0.2076384574174881, + "rewards/rejected": -2.1668918132781982, + "step": 760 + }, + { + "epoch": 0.1, + "grad_norm": 12.875, + "learning_rate": 4.99999347843947e-06, + "logits/chosen": -1.8158515691757202, + "logits/rejected": -1.601609230041504, + "logps/chosen": -350.30145263671875, + "logps/rejected": -348.83441162109375, + "loss": 0.5921, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.057814359664917, + "rewards/margins": 0.3119596242904663, + "rewards/rejected": -1.3697741031646729, + "step": 770 + }, + { + "epoch": 0.1, + "grad_norm": 7.625, + "learning_rate": 4.999941306159375e-06, + "logits/chosen": -2.322629451751709, + "logits/rejected": -2.3233141899108887, + "logps/chosen": -302.6885986328125, + "logps/rejected": -347.59197998046875, + "loss": 0.5802, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.5237258672714233, + "rewards/margins": 0.384928822517395, + "rewards/rejected": -0.9086545705795288, + "step": 780 + }, + { + "epoch": 0.1, + "grad_norm": 13.0625, + "learning_rate": 4.999836962687967e-06, + "logits/chosen": -1.4795883893966675, + "logits/rejected": -1.3030979633331299, + "logps/chosen": -359.45623779296875, + "logps/rejected": -399.82476806640625, + "loss": 0.6045, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2977018356323242, + "rewards/margins": 0.38991469144821167, + "rewards/rejected": -1.6876163482666016, + "step": 790 + }, + { + "epoch": 0.1, + "grad_norm": 6.9375, + "learning_rate": 4.999680450202786e-06, + "logits/chosen": -0.6206024289131165, + "logits/rejected": -0.4429781436920166, + "logps/chosen": -472.45892333984375, + "logps/rejected": -505.4447326660156, + "loss": 0.6171, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7535765171051025, + "rewards/margins": 0.31215736269950867, + "rewards/rejected": -2.0657336711883545, + "step": 800 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -0.3348754942417145, + "eval_logits/rejected": 0.023464158177375793, + "eval_logps/chosen": -440.0762023925781, + "eval_logps/rejected": -450.9858093261719, + "eval_loss": 0.6371665000915527, + "eval_rewards/accuracies": 0.640500009059906, + "eval_rewards/chosen": -1.7545514106750488, + "eval_rewards/margins": 0.3095169961452484, + "eval_rewards/rejected": -2.06406831741333, + "eval_runtime": 1593.2658, + "eval_samples_per_second": 1.255, + "eval_steps_per_second": 0.314, + "step": 800 + }, + { + "epoch": 0.11, + "grad_norm": 14.9375, + "learning_rate": 4.999471771970087e-06, + "logits/chosen": -0.6086374521255493, + "logits/rejected": -0.3623660206794739, + "logps/chosen": -411.4615783691406, + "logps/rejected": -428.5804748535156, + "loss": 0.6063, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6526752710342407, + "rewards/margins": 0.3578190803527832, + "rewards/rejected": -2.0104942321777344, + "step": 810 + }, + { + "epoch": 0.11, + "grad_norm": 12.8125, + "learning_rate": 4.999210932344767e-06, + "logits/chosen": -1.3322651386260986, + "logits/rejected": -1.328564167022705, + "logps/chosen": -397.54046630859375, + "logps/rejected": -491.22869873046875, + "loss": 0.6269, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5655710697174072, + "rewards/margins": 0.4129908084869385, + "rewards/rejected": -1.9785619974136353, + "step": 820 + }, + { + "epoch": 0.11, + "grad_norm": 9.5, + "learning_rate": 4.998897936770281e-06, + "logits/chosen": -1.9537856578826904, + "logits/rejected": -1.8272556066513062, + "logps/chosen": -436.32110595703125, + "logps/rejected": -406.75750732421875, + "loss": 0.5976, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.332761526107788, + "rewards/margins": 0.4022786021232605, + "rewards/rejected": -1.735040307044983, + "step": 830 + }, + { + "epoch": 0.11, + "grad_norm": 9.3125, + "learning_rate": 4.998532791778521e-06, + "logits/chosen": -2.0961689949035645, + "logits/rejected": -2.0401535034179688, + "logps/chosen": -322.5164489746094, + "logps/rejected": -366.49139404296875, + "loss": 0.6556, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9384282827377319, + "rewards/margins": 0.1931869089603424, + "rewards/rejected": -1.1316152811050415, + "step": 840 + }, + { + "epoch": 0.11, + "grad_norm": 6.40625, + "learning_rate": 4.9981155049896885e-06, + "logits/chosen": -1.9329078197479248, + "logits/rejected": -1.8825485706329346, + "logps/chosen": -370.23004150390625, + "logps/rejected": -405.8755798339844, + "loss": 0.5015, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8216459155082703, + "rewards/margins": 0.5655540823936462, + "rewards/rejected": -1.387199878692627, + "step": 850 + }, + { + "epoch": 0.11, + "grad_norm": 10.3125, + "learning_rate": 4.997646085112126e-06, + "logits/chosen": -1.2966748476028442, + "logits/rejected": -1.0200374126434326, + "logps/chosen": -397.69146728515625, + "logps/rejected": -391.2544860839844, + "loss": 0.6826, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3053414821624756, + "rewards/margins": 0.20140385627746582, + "rewards/rejected": -1.506745457649231, + "step": 860 + }, + { + "epoch": 0.11, + "grad_norm": 12.9375, + "learning_rate": 4.997124541942141e-06, + "logits/chosen": -0.5647540092468262, + "logits/rejected": -0.14192932844161987, + "logps/chosen": -369.59539794921875, + "logps/rejected": -405.70843505859375, + "loss": 0.6578, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4736089706420898, + "rewards/margins": 0.299024760723114, + "rewards/rejected": -1.7726337909698486, + "step": 870 + }, + { + "epoch": 0.12, + "grad_norm": 6.34375, + "learning_rate": 4.996550886363801e-06, + "logits/chosen": -0.5589116811752319, + "logits/rejected": -0.307116836309433, + "logps/chosen": -302.707275390625, + "logps/rejected": -328.901123046875, + "loss": 0.5766, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9649427533149719, + "rewards/margins": 0.3963368535041809, + "rewards/rejected": -1.3612796068191528, + "step": 880 + }, + { + "epoch": 0.12, + "grad_norm": 9.625, + "learning_rate": 4.995925130348706e-06, + "logits/chosen": 0.031105151399970055, + "logits/rejected": 0.044142670929431915, + "logps/chosen": -287.320068359375, + "logps/rejected": -327.9627990722656, + "loss": 0.6079, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.937950611114502, + "rewards/margins": 0.3377194404602051, + "rewards/rejected": -1.2756701707839966, + "step": 890 + }, + { + "epoch": 0.12, + "grad_norm": 9.4375, + "learning_rate": 4.995247286955734e-06, + "logits/chosen": 1.0088450908660889, + "logits/rejected": 1.7407630681991577, + "logps/chosen": -457.3868103027344, + "logps/rejected": -457.37579345703125, + "loss": 0.5553, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2731468677520752, + "rewards/margins": 0.5888018012046814, + "rewards/rejected": -1.8619487285614014, + "step": 900 + }, + { + "epoch": 0.12, + "eval_logits/chosen": 1.9978185892105103, + "eval_logits/rejected": 2.6186578273773193, + "eval_logps/chosen": -399.6167907714844, + "eval_logps/rejected": -429.9809265136719, + "eval_loss": 0.5687113404273987, + "eval_rewards/accuracies": 0.6930000185966492, + "eval_rewards/chosen": -1.3499573469161987, + "eval_rewards/margins": 0.5040626525878906, + "eval_rewards/rejected": -1.8540199995040894, + "eval_runtime": 1593.662, + "eval_samples_per_second": 1.255, + "eval_steps_per_second": 0.314, + "step": 900 + }, + { + "epoch": 0.12, + "grad_norm": 10.3125, + "learning_rate": 4.994517370330779e-06, + "logits/chosen": 0.6192656755447388, + "logits/rejected": 1.669968605041504, + "logps/chosen": -435.8106994628906, + "logps/rejected": -448.48309326171875, + "loss": 0.5258, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4264830350875854, + "rewards/margins": 0.6171587705612183, + "rewards/rejected": -2.043642044067383, + "step": 910 + }, + { + "epoch": 0.12, + "grad_norm": 15.6875, + "learning_rate": 4.993735395706446e-06, + "logits/chosen": 1.1262381076812744, + "logits/rejected": 1.8840898275375366, + "logps/chosen": -394.78826904296875, + "logps/rejected": -433.81988525390625, + "loss": 0.5314, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5628669261932373, + "rewards/margins": 0.6049157977104187, + "rewards/rejected": -2.167782783508301, + "step": 920 + }, + { + "epoch": 0.12, + "grad_norm": 11.25, + "learning_rate": 4.992901379401737e-06, + "logits/chosen": 0.14038251340389252, + "logits/rejected": 0.24134019017219543, + "logps/chosen": -398.86578369140625, + "logps/rejected": -472.0187072753906, + "loss": 0.5735, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3214950561523438, + "rewards/margins": 0.5916348695755005, + "rewards/rejected": -1.9131300449371338, + "step": 930 + }, + { + "epoch": 0.12, + "grad_norm": 11.75, + "learning_rate": 4.992015338821711e-06, + "logits/chosen": -0.04747029393911362, + "logits/rejected": 1.1106561422348022, + "logps/chosen": -391.7012023925781, + "logps/rejected": -389.11602783203125, + "loss": 0.6282, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.212070345878601, + "rewards/margins": 0.5102335214614868, + "rewards/rejected": -1.7223039865493774, + "step": 940 + }, + { + "epoch": 0.12, + "grad_norm": 13.375, + "learning_rate": 4.991077292457117e-06, + "logits/chosen": 1.087203860282898, + "logits/rejected": 1.1788231134414673, + "logps/chosen": -340.7041931152344, + "logps/rejected": -388.44293212890625, + "loss": 0.6201, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0653443336486816, + "rewards/margins": 0.430236279964447, + "rewards/rejected": -1.4955805540084839, + "step": 950 + }, + { + "epoch": 0.13, + "grad_norm": 7.65625, + "learning_rate": 4.990087259884016e-06, + "logits/chosen": 0.7325303554534912, + "logits/rejected": 1.3778412342071533, + "logps/chosen": -308.00799560546875, + "logps/rejected": -357.46337890625, + "loss": 0.5209, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8226054310798645, + "rewards/margins": 0.6232573390007019, + "rewards/rejected": -1.445862889289856, + "step": 960 + }, + { + "epoch": 0.13, + "grad_norm": 17.875, + "learning_rate": 4.989045261763362e-06, + "logits/chosen": 1.262274980545044, + "logits/rejected": 1.1897504329681396, + "logps/chosen": -346.5185546875, + "logps/rejected": -419.4173278808594, + "loss": 0.5454, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0726191997528076, + "rewards/margins": 0.6499998569488525, + "rewards/rejected": -1.7226190567016602, + "step": 970 + }, + { + "epoch": 0.13, + "grad_norm": 10.4375, + "learning_rate": 4.98795131984058e-06, + "logits/chosen": 0.36344969272613525, + "logits/rejected": 1.0654281377792358, + "logps/chosen": -407.2535705566406, + "logps/rejected": -422.3675842285156, + "loss": 0.631, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3320995569229126, + "rewards/margins": 0.524221658706665, + "rewards/rejected": -1.856321096420288, + "step": 980 + }, + { + "epoch": 0.13, + "grad_norm": 5.875, + "learning_rate": 4.986805456945107e-06, + "logits/chosen": 1.200073480606079, + "logits/rejected": 2.0859665870666504, + "logps/chosen": -359.2008361816406, + "logps/rejected": -392.6002502441406, + "loss": 0.5752, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1315548419952393, + "rewards/margins": 0.49356216192245483, + "rewards/rejected": -1.6251170635223389, + "step": 990 + }, + { + "epoch": 0.13, + "grad_norm": 16.5, + "learning_rate": 4.985607696989919e-06, + "logits/chosen": 1.4213409423828125, + "logits/rejected": 2.354599714279175, + "logps/chosen": -426.97015380859375, + "logps/rejected": -458.9466857910156, + "loss": 0.6299, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4458622932434082, + "rewards/margins": 0.4511790871620178, + "rewards/rejected": -1.8970413208007812, + "step": 1000 + }, + { + "epoch": 0.13, + "eval_logits/chosen": 2.715547561645508, + "eval_logits/rejected": 3.4191699028015137, + "eval_logps/chosen": -380.9113464355469, + "eval_logps/rejected": -419.2182312011719, + "eval_loss": 0.5619704723358154, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -1.1629031896591187, + "eval_rewards/margins": 0.5834897756576538, + "eval_rewards/rejected": -1.746392846107483, + "eval_runtime": 1593.8153, + "eval_samples_per_second": 1.255, + "eval_steps_per_second": 0.314, + "step": 1000 + }, + { + "epoch": 0.13, + "grad_norm": 9.5, + "learning_rate": 4.984358064971026e-06, + "logits/chosen": 1.5299071073532104, + "logits/rejected": 1.7706670761108398, + "logps/chosen": -340.0618591308594, + "logps/rejected": -408.04302978515625, + "loss": 0.5532, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9399666786193848, + "rewards/margins": 0.690420925617218, + "rewards/rejected": -1.6303876638412476, + "step": 1010 + }, + { + "epoch": 0.13, + "grad_norm": 9.9375, + "learning_rate": 4.983056586966958e-06, + "logits/chosen": 0.010038676671683788, + "logits/rejected": 0.8103101849555969, + "logps/chosen": -340.81524658203125, + "logps/rejected": -335.1512145996094, + "loss": 0.563, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7835585474967957, + "rewards/margins": 0.46336811780929565, + "rewards/rejected": -1.2469266653060913, + "step": 1020 + }, + { + "epoch": 0.13, + "grad_norm": 16.5, + "learning_rate": 4.981703290138215e-06, + "logits/chosen": 1.5737102031707764, + "logits/rejected": 1.8098487854003906, + "logps/chosen": -342.5895690917969, + "logps/rejected": -421.34698486328125, + "loss": 0.5058, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0919854640960693, + "rewards/margins": 0.6574614644050598, + "rewards/rejected": -1.7494471073150635, + "step": 1030 + }, + { + "epoch": 0.14, + "grad_norm": 18.0, + "learning_rate": 4.980298202726706e-06, + "logits/chosen": 2.3896193504333496, + "logits/rejected": 2.306922435760498, + "logps/chosen": -377.5128479003906, + "logps/rejected": -464.7425842285156, + "loss": 0.566, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3240822553634644, + "rewards/margins": 0.7185450792312622, + "rewards/rejected": -2.0426273345947266, + "step": 1040 + }, + { + "epoch": 0.14, + "grad_norm": 9.5625, + "learning_rate": 4.978841354055148e-06, + "logits/chosen": 1.665701150894165, + "logits/rejected": 2.199453592300415, + "logps/chosen": -363.497314453125, + "logps/rejected": -445.28619384765625, + "loss": 0.5545, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3350220918655396, + "rewards/margins": 0.6997233629226685, + "rewards/rejected": -2.034745216369629, + "step": 1050 + }, + { + "epoch": 0.14, + "grad_norm": 8.5, + "learning_rate": 4.977332774526471e-06, + "logits/chosen": 1.5860077142715454, + "logits/rejected": 2.17120099067688, + "logps/chosen": -387.22027587890625, + "logps/rejected": -447.6641540527344, + "loss": 0.5731, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2284187078475952, + "rewards/margins": 0.6316946148872375, + "rewards/rejected": -1.860113501548767, + "step": 1060 + }, + { + "epoch": 0.14, + "grad_norm": 9.3125, + "learning_rate": 4.97577249562317e-06, + "logits/chosen": 1.3059680461883545, + "logits/rejected": 2.729459047317505, + "logps/chosen": -400.4163513183594, + "logps/rejected": -435.00238037109375, + "loss": 0.5853, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3927881717681885, + "rewards/margins": 0.6526451110839844, + "rewards/rejected": -2.045433282852173, + "step": 1070 + }, + { + "epoch": 0.14, + "grad_norm": 13.6875, + "learning_rate": 4.974160549906652e-06, + "logits/chosen": 1.9924103021621704, + "logits/rejected": 2.650087356567383, + "logps/chosen": -385.2442932128906, + "logps/rejected": -420.45648193359375, + "loss": 0.584, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.272428035736084, + "rewards/margins": 0.5629833340644836, + "rewards/rejected": -1.8354114294052124, + "step": 1080 + }, + { + "epoch": 0.14, + "grad_norm": 8.125, + "learning_rate": 4.972496971016559e-06, + "logits/chosen": 3.1397948265075684, + "logits/rejected": 3.477116107940674, + "logps/chosen": -447.5340881347656, + "logps/rejected": -512.5313110351562, + "loss": 0.4966, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.014864206314087, + "rewards/margins": 0.7934069037437439, + "rewards/rejected": -2.8082709312438965, + "step": 1090 + }, + { + "epoch": 0.14, + "grad_norm": 10.5, + "learning_rate": 4.9707817936700635e-06, + "logits/chosen": 2.5927772521972656, + "logits/rejected": 4.051385879516602, + "logps/chosen": -492.2637634277344, + "logps/rejected": -532.87060546875, + "loss": 0.5898, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.3247809410095215, + "rewards/margins": 0.7253618836402893, + "rewards/rejected": -3.050142765045166, + "step": 1100 + }, + { + "epoch": 0.14, + "eval_logits/chosen": 4.413373947143555, + "eval_logits/rejected": 5.307820796966553, + "eval_logps/chosen": -508.3033447265625, + "eval_logps/rejected": -554.2042236328125, + "eval_loss": 0.561853289604187, + "eval_rewards/accuracies": 0.7089999914169312, + "eval_rewards/chosen": -2.4368231296539307, + "eval_rewards/margins": 0.6594300270080566, + "eval_rewards/rejected": -3.0962531566619873, + "eval_runtime": 1593.906, + "eval_samples_per_second": 1.255, + "eval_steps_per_second": 0.314, + "step": 1100 + }, + { + "epoch": 0.15, + "grad_norm": 8.1875, + "learning_rate": 4.969015053661142e-06, + "logits/chosen": 2.426276683807373, + "logits/rejected": 4.329206943511963, + "logps/chosen": -526.8760986328125, + "logps/rejected": -530.1583251953125, + "loss": 0.5581, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.395789384841919, + "rewards/margins": 0.6795339584350586, + "rewards/rejected": -3.0753233432769775, + "step": 1110 + }, + { + "epoch": 0.15, + "grad_norm": 5.4375, + "learning_rate": 4.967196787859835e-06, + "logits/chosen": 2.7714107036590576, + "logits/rejected": 3.8114936351776123, + "logps/chosen": -434.1639099121094, + "logps/rejected": -502.12945556640625, + "loss": 0.5417, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9444061517715454, + "rewards/margins": 0.7909334301948547, + "rewards/rejected": -2.735339403152466, + "step": 1120 + }, + { + "epoch": 0.15, + "grad_norm": 14.0625, + "learning_rate": 4.965327034211469e-06, + "logits/chosen": 1.6808677911758423, + "logits/rejected": 2.370750904083252, + "logps/chosen": -446.2379455566406, + "logps/rejected": -482.42315673828125, + "loss": 0.6304, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5557124614715576, + "rewards/margins": 0.4479561448097229, + "rewards/rejected": -2.003668785095215, + "step": 1130 + }, + { + "epoch": 0.15, + "grad_norm": 8.25, + "learning_rate": 4.96340583173587e-06, + "logits/chosen": 1.121919870376587, + "logits/rejected": 2.6305222511291504, + "logps/chosen": -355.7509460449219, + "logps/rejected": -417.93560791015625, + "loss": 0.4562, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1430405378341675, + "rewards/margins": 0.7580685615539551, + "rewards/rejected": -1.901108980178833, + "step": 1140 + }, + { + "epoch": 0.15, + "grad_norm": 9.9375, + "learning_rate": 4.96143322052655e-06, + "logits/chosen": 2.945784330368042, + "logits/rejected": 3.851064682006836, + "logps/chosen": -396.95355224609375, + "logps/rejected": -517.4216918945312, + "loss": 0.5018, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6020971536636353, + "rewards/margins": 0.9275411367416382, + "rewards/rejected": -2.5296382904052734, + "step": 1150 + }, + { + "epoch": 0.15, + "grad_norm": 9.625, + "learning_rate": 4.959409241749864e-06, + "logits/chosen": 2.93442702293396, + "logits/rejected": 3.921786069869995, + "logps/chosen": -467.01605224609375, + "logps/rejected": -536.6296997070312, + "loss": 0.6158, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.274714708328247, + "rewards/margins": 0.7702791690826416, + "rewards/rejected": -3.0449936389923096, + "step": 1160 + }, + { + "epoch": 0.15, + "grad_norm": 13.4375, + "learning_rate": 4.957333937644159e-06, + "logits/chosen": 2.806804656982422, + "logits/rejected": 3.4476191997528076, + "logps/chosen": -494.03155517578125, + "logps/rejected": -536.6123657226562, + "loss": 0.6034, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.2405643463134766, + "rewards/margins": 0.6038447022438049, + "rewards/rejected": -2.844409227371216, + "step": 1170 + }, + { + "epoch": 0.15, + "grad_norm": 7.75, + "learning_rate": 4.955207351518885e-06, + "logits/chosen": 2.9292523860931396, + "logits/rejected": 3.8048958778381348, + "logps/chosen": -519.1878662109375, + "logps/rejected": -557.2689208984375, + "loss": 0.5872, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.184176206588745, + "rewards/margins": 0.6229387521743774, + "rewards/rejected": -2.807114839553833, + "step": 1180 + }, + { + "epoch": 0.16, + "grad_norm": 6.125, + "learning_rate": 4.953029527753699e-06, + "logits/chosen": 2.54266357421875, + "logits/rejected": 3.0706546306610107, + "logps/chosen": -409.0013122558594, + "logps/rejected": -532.3023681640625, + "loss": 0.5531, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8078029155731201, + "rewards/margins": 0.9258928298950195, + "rewards/rejected": -2.7336957454681396, + "step": 1190 + }, + { + "epoch": 0.16, + "grad_norm": 18.25, + "learning_rate": 4.95080051179753e-06, + "logits/chosen": 2.138737678527832, + "logits/rejected": 2.217345952987671, + "logps/chosen": -414.7706604003906, + "logps/rejected": -497.00506591796875, + "loss": 0.4782, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3488829135894775, + "rewards/margins": 0.9236246943473816, + "rewards/rejected": -2.272507429122925, + "step": 1200 + }, + { + "epoch": 0.16, + "eval_logits/chosen": 3.148491859436035, + "eval_logits/rejected": 4.01865291595459, + "eval_logps/chosen": -415.2228698730469, + "eval_logps/rejected": -468.4132385253906, + "eval_loss": 0.5594326257705688, + "eval_rewards/accuracies": 0.7089999914169312, + "eval_rewards/chosen": -1.5060184001922607, + "eval_rewards/margins": 0.7323250770568848, + "eval_rewards/rejected": -2.2383434772491455, + "eval_runtime": 1593.535, + "eval_samples_per_second": 1.255, + "eval_steps_per_second": 0.314, + "step": 1200 + }, + { + "epoch": 0.16, + "grad_norm": 13.6875, + "learning_rate": 4.948520350167637e-06, + "logits/chosen": 1.060068130493164, + "logits/rejected": 2.634003162384033, + "logps/chosen": -432.3033142089844, + "logps/rejected": -470.77783203125, + "loss": 0.5074, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6320292949676514, + "rewards/margins": 0.7347411513328552, + "rewards/rejected": -2.3667702674865723, + "step": 1210 + }, + { + "epoch": 0.16, + "grad_norm": 8.5625, + "learning_rate": 4.946189090448639e-06, + "logits/chosen": 1.204738974571228, + "logits/rejected": 2.704601287841797, + "logps/chosen": -428.56695556640625, + "logps/rejected": -472.8544006347656, + "loss": 0.5472, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6184256076812744, + "rewards/margins": 0.8675263524055481, + "rewards/rejected": -2.4859519004821777, + "step": 1220 + }, + { + "epoch": 0.16, + "grad_norm": 7.78125, + "learning_rate": 4.943806781291515e-06, + "logits/chosen": 2.1226108074188232, + "logits/rejected": 3.0666372776031494, + "logps/chosen": -409.69342041015625, + "logps/rejected": -433.8475646972656, + "loss": 0.6601, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4817649126052856, + "rewards/margins": 0.5832273960113525, + "rewards/rejected": -2.0649921894073486, + "step": 1230 + }, + { + "epoch": 0.16, + "grad_norm": 22.0, + "learning_rate": 4.941373472412595e-06, + "logits/chosen": 2.166796922683716, + "logits/rejected": 3.9252426624298096, + "logps/chosen": -435.42559814453125, + "logps/rejected": -491.82464599609375, + "loss": 0.5594, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7724716663360596, + "rewards/margins": 0.9388161897659302, + "rewards/rejected": -2.7112877368927, + "step": 1240 + }, + { + "epoch": 0.16, + "grad_norm": 7.5625, + "learning_rate": 4.938889214592521e-06, + "logits/chosen": 2.505998373031616, + "logits/rejected": 3.1269757747650146, + "logps/chosen": -388.64581298828125, + "logps/rejected": -475.17303466796875, + "loss": 0.4631, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7814483642578125, + "rewards/margins": 1.0449895858764648, + "rewards/rejected": -2.8264379501342773, + "step": 1250 + }, + { + "epoch": 0.16, + "grad_norm": 13.5625, + "learning_rate": 4.936354059675186e-06, + "logits/chosen": 0.6287264823913574, + "logits/rejected": 1.3099499940872192, + "logps/chosen": -386.27423095703125, + "logps/rejected": -423.3670959472656, + "loss": 0.6161, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.42852783203125, + "rewards/margins": 0.5442193150520325, + "rewards/rejected": -1.9727470874786377, + "step": 1260 + }, + { + "epoch": 0.17, + "grad_norm": 8.6875, + "learning_rate": 4.933768060566654e-06, + "logits/chosen": -0.03582334518432617, + "logits/rejected": 1.0751930475234985, + "logps/chosen": -376.5238952636719, + "logps/rejected": -389.35760498046875, + "loss": 0.5561, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0833394527435303, + "rewards/margins": 0.5493221879005432, + "rewards/rejected": -1.6326615810394287, + "step": 1270 + }, + { + "epoch": 0.17, + "grad_norm": 7.71875, + "learning_rate": 4.931131271234052e-06, + "logits/chosen": 0.22822928428649902, + "logits/rejected": 1.3094651699066162, + "logps/chosen": -337.14202880859375, + "logps/rejected": -388.39044189453125, + "loss": 0.5507, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9693458676338196, + "rewards/margins": 0.6725118160247803, + "rewards/rejected": -1.6418577432632446, + "step": 1280 + }, + { + "epoch": 0.17, + "grad_norm": 16.125, + "learning_rate": 4.928443746704448e-06, + "logits/chosen": 1.678131341934204, + "logits/rejected": 2.727170944213867, + "logps/chosen": -398.09490966796875, + "logps/rejected": -431.4143981933594, + "loss": 0.6014, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.361238956451416, + "rewards/margins": 0.689416766166687, + "rewards/rejected": -2.0506556034088135, + "step": 1290 + }, + { + "epoch": 0.17, + "grad_norm": 8.75, + "learning_rate": 4.925705543063703e-06, + "logits/chosen": 1.2936311960220337, + "logits/rejected": 2.5942752361297607, + "logps/chosen": -441.23931884765625, + "logps/rejected": -458.55279541015625, + "loss": 0.5709, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5207133293151855, + "rewards/margins": 0.6074548959732056, + "rewards/rejected": -2.1281678676605225, + "step": 1300 + }, + { + "epoch": 0.17, + "eval_logits/chosen": 3.2569968700408936, + "eval_logits/rejected": 4.131510257720947, + "eval_logps/chosen": -437.77825927734375, + "eval_logps/rejected": -481.25823974609375, + "eval_loss": 0.5480995774269104, + "eval_rewards/accuracies": 0.7245000004768372, + "eval_rewards/chosen": -1.7315717935562134, + "eval_rewards/margins": 0.6352214813232422, + "eval_rewards/rejected": -2.366793394088745, + "eval_runtime": 1593.4224, + "eval_samples_per_second": 1.255, + "eval_steps_per_second": 0.314, + "step": 1300 + }, + { + "epoch": 0.17, + "grad_norm": 10.75, + "learning_rate": 4.922916717455297e-06, + "logits/chosen": 2.208543300628662, + "logits/rejected": 3.476393938064575, + "logps/chosen": -419.43719482421875, + "logps/rejected": -469.15582275390625, + "loss": 0.4688, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7308155298233032, + "rewards/margins": 0.9251171946525574, + "rewards/rejected": -2.655932903289795, + "step": 1310 + }, + { + "epoch": 0.17, + "grad_norm": 14.125, + "learning_rate": 4.920077328079136e-06, + "logits/chosen": 0.9731375575065613, + "logits/rejected": 1.9899227619171143, + "logps/chosen": -395.8177795410156, + "logps/rejected": -478.02020263671875, + "loss": 0.5184, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.540832757949829, + "rewards/margins": 0.8474717140197754, + "rewards/rejected": -2.3883044719696045, + "step": 1320 + }, + { + "epoch": 0.17, + "grad_norm": 10.6875, + "learning_rate": 4.9171874341903445e-06, + "logits/chosen": 0.6347458362579346, + "logits/rejected": 1.3302295207977295, + "logps/chosen": -389.74908447265625, + "logps/rejected": -467.36187744140625, + "loss": 0.5382, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3787751197814941, + "rewards/margins": 0.6735467910766602, + "rewards/rejected": -2.0523219108581543, + "step": 1330 + }, + { + "epoch": 0.18, + "grad_norm": 10.625, + "learning_rate": 4.914247096098019e-06, + "logits/chosen": 0.6093196272850037, + "logits/rejected": 1.4135183095932007, + "logps/chosen": -438.0009765625, + "logps/rejected": -489.31610107421875, + "loss": 0.5539, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.742841124534607, + "rewards/margins": 0.6880618333816528, + "rewards/rejected": -2.4309029579162598, + "step": 1340 + }, + { + "epoch": 0.18, + "grad_norm": 17.0, + "learning_rate": 4.911256375163977e-06, + "logits/chosen": 2.0072357654571533, + "logits/rejected": 1.9592632055282593, + "logps/chosen": -486.7958984375, + "logps/rejected": -544.7276611328125, + "loss": 0.6466, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.335031270980835, + "rewards/margins": 0.4429550766944885, + "rewards/rejected": -2.7779860496520996, + "step": 1350 + }, + { + "epoch": 0.18, + "grad_norm": 10.25, + "learning_rate": 4.908215333801474e-06, + "logits/chosen": 2.0604007244110107, + "logits/rejected": 2.6346802711486816, + "logps/chosen": -434.343017578125, + "logps/rejected": -519.6720581054688, + "loss": 0.5155, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1249516010284424, + "rewards/margins": 0.7996464967727661, + "rewards/rejected": -2.92459774017334, + "step": 1360 + }, + { + "epoch": 0.18, + "grad_norm": 8.9375, + "learning_rate": 4.9051240354739004e-06, + "logits/chosen": 1.6247851848602295, + "logits/rejected": 2.725818157196045, + "logps/chosen": -488.3594665527344, + "logps/rejected": -547.042724609375, + "loss": 0.4948, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.056096315383911, + "rewards/margins": 0.8274715542793274, + "rewards/rejected": -2.8835678100585938, + "step": 1370 + }, + { + "epoch": 0.18, + "grad_norm": 17.0, + "learning_rate": 4.901982544693457e-06, + "logits/chosen": 2.3090920448303223, + "logits/rejected": 3.1643126010894775, + "logps/chosen": -486.04339599609375, + "logps/rejected": -573.066650390625, + "loss": 0.4762, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.1145107746124268, + "rewards/margins": 1.0138976573944092, + "rewards/rejected": -3.128408432006836, + "step": 1380 + }, + { + "epoch": 0.18, + "grad_norm": 14.0, + "learning_rate": 4.898790927019809e-06, + "logits/chosen": 2.4458489418029785, + "logits/rejected": 3.7828209400177, + "logps/chosen": -495.18939208984375, + "logps/rejected": -551.55126953125, + "loss": 0.5416, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4461328983306885, + "rewards/margins": 0.7788988351821899, + "rewards/rejected": -3.225032091140747, + "step": 1390 + }, + { + "epoch": 0.18, + "grad_norm": 7.71875, + "learning_rate": 4.895549249058718e-06, + "logits/chosen": 2.8335728645324707, + "logits/rejected": 3.7893567085266113, + "logps/chosen": -523.3660888671875, + "logps/rejected": -584.7088623046875, + "loss": 0.5181, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.668732166290283, + "rewards/margins": 0.8389676213264465, + "rewards/rejected": -3.507699966430664, + "step": 1400 + }, + { + "epoch": 0.18, + "eval_logits/chosen": 3.6943774223327637, + "eval_logits/rejected": 4.697685241699219, + "eval_logps/chosen": -513.1900024414062, + "eval_logps/rejected": -583.56396484375, + "eval_loss": 0.5454376935958862, + "eval_rewards/accuracies": 0.7139999866485596, + "eval_rewards/chosen": -2.4856903553009033, + "eval_rewards/margins": 0.904159665107727, + "eval_rewards/rejected": -3.389849901199341, + "eval_runtime": 1592.6989, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 1400 + }, + { + "epoch": 0.18, + "grad_norm": 8.6875, + "learning_rate": 4.892257578460656e-06, + "logits/chosen": 1.9299166202545166, + "logits/rejected": 2.9093315601348877, + "logps/chosen": -498.1167907714844, + "logps/rejected": -564.88330078125, + "loss": 0.6278, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.382009983062744, + "rewards/margins": 0.6775062680244446, + "rewards/rejected": -3.059515953063965, + "step": 1410 + }, + { + "epoch": 0.19, + "grad_norm": 9.3125, + "learning_rate": 4.888915983919383e-06, + "logits/chosen": 1.348301649093628, + "logits/rejected": 2.4204530715942383, + "logps/chosen": -446.24456787109375, + "logps/rejected": -506.3639221191406, + "loss": 0.5718, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0002195835113525, + "rewards/margins": 0.7560812830924988, + "rewards/rejected": -2.756300926208496, + "step": 1420 + }, + { + "epoch": 0.19, + "grad_norm": 5.8125, + "learning_rate": 4.885524535170525e-06, + "logits/chosen": 1.6420373916625977, + "logits/rejected": 2.3852953910827637, + "logps/chosen": -449.477294921875, + "logps/rejected": -495.0896911621094, + "loss": 0.5018, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.8907318115234375, + "rewards/margins": 0.7322606444358826, + "rewards/rejected": -2.622992515563965, + "step": 1430 + }, + { + "epoch": 0.19, + "grad_norm": 8.625, + "learning_rate": 4.882083302990113e-06, + "logits/chosen": 1.841507911682129, + "logits/rejected": 2.7790274620056152, + "logps/chosen": -488.05938720703125, + "logps/rejected": -542.5457763671875, + "loss": 0.5289, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.215773105621338, + "rewards/margins": 0.7926411032676697, + "rewards/rejected": -3.0084145069122314, + "step": 1440 + }, + { + "epoch": 0.19, + "grad_norm": 6.84375, + "learning_rate": 4.878592359193104e-06, + "logits/chosen": 2.2885186672210693, + "logits/rejected": 2.9249768257141113, + "logps/chosen": -469.7869567871094, + "logps/rejected": -578.7250366210938, + "loss": 0.5524, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.663689136505127, + "rewards/margins": 0.9176554679870605, + "rewards/rejected": -3.5813446044921875, + "step": 1450 + }, + { + "epoch": 0.19, + "grad_norm": 10.0, + "learning_rate": 4.875051776631888e-06, + "logits/chosen": 2.708163022994995, + "logits/rejected": 3.2748348712921143, + "logps/chosen": -531.5555419921875, + "logps/rejected": -622.9147338867188, + "loss": 0.528, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.9948220252990723, + "rewards/margins": 0.846728503704071, + "rewards/rejected": -3.84155011177063, + "step": 1460 + }, + { + "epoch": 0.19, + "grad_norm": 8.375, + "learning_rate": 4.871461629194764e-06, + "logits/chosen": 2.8306050300598145, + "logits/rejected": 4.097413539886475, + "logps/chosen": -667.3284912109375, + "logps/rejected": -681.8602905273438, + "loss": 0.6414, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.854893922805786, + "rewards/margins": 0.5547782778739929, + "rewards/rejected": -4.409672737121582, + "step": 1470 + }, + { + "epoch": 0.19, + "grad_norm": 10.625, + "learning_rate": 4.8678219918043984e-06, + "logits/chosen": 2.2125823497772217, + "logits/rejected": 2.6229119300842285, + "logps/chosen": -522.2234497070312, + "logps/rejected": -616.5860595703125, + "loss": 0.5614, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.176955461502075, + "rewards/margins": 0.8344646692276001, + "rewards/rejected": -4.011419773101807, + "step": 1480 + }, + { + "epoch": 0.19, + "grad_norm": 10.5, + "learning_rate": 4.864132940416262e-06, + "logits/chosen": 1.9612172842025757, + "logits/rejected": 3.014679431915283, + "logps/chosen": -548.0928344726562, + "logps/rejected": -592.0524291992188, + "loss": 0.5564, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.925997495651245, + "rewards/margins": 0.7844290137290955, + "rewards/rejected": -3.710425853729248, + "step": 1490 + }, + { + "epoch": 0.2, + "grad_norm": 14.375, + "learning_rate": 4.860394552017044e-06, + "logits/chosen": 2.663686990737915, + "logits/rejected": 3.601266384124756, + "logps/chosen": -519.1183471679688, + "logps/rejected": -590.6419067382812, + "loss": 0.5495, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.768385887145996, + "rewards/margins": 0.7279509902000427, + "rewards/rejected": -3.4963364601135254, + "step": 1500 + }, + { + "epoch": 0.2, + "eval_logits/chosen": 3.288820266723633, + "eval_logits/rejected": 4.1846513748168945, + "eval_logps/chosen": -520.6432495117188, + "eval_logps/rejected": -580.3214721679688, + "eval_loss": 0.5428246855735779, + "eval_rewards/accuracies": 0.7204999923706055, + "eval_rewards/chosen": -2.5602221488952637, + "eval_rewards/margins": 0.797203540802002, + "eval_rewards/rejected": -3.3574254512786865, + "eval_runtime": 1592.2923, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 1500 + }, + { + "epoch": 0.2, + "grad_norm": 7.96875, + "learning_rate": 4.856606904623047e-06, + "logits/chosen": 1.537600040435791, + "logits/rejected": 2.585251569747925, + "logps/chosen": -543.2179565429688, + "logps/rejected": -580.5303344726562, + "loss": 0.5442, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.247091293334961, + "rewards/margins": 0.7817636728286743, + "rewards/rejected": -3.0288548469543457, + "step": 1510 + }, + { + "epoch": 0.2, + "grad_norm": 12.8125, + "learning_rate": 4.852770077278557e-06, + "logits/chosen": 1.9782816171646118, + "logits/rejected": 2.7694122791290283, + "logps/chosen": -489.9452209472656, + "logps/rejected": -541.9598388671875, + "loss": 0.4728, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1113080978393555, + "rewards/margins": 0.8755633234977722, + "rewards/rejected": -2.9868717193603516, + "step": 1520 + }, + { + "epoch": 0.2, + "grad_norm": 6.71875, + "learning_rate": 4.848884150054196e-06, + "logits/chosen": 2.329437732696533, + "logits/rejected": 2.7830517292022705, + "logps/chosen": -549.2725219726562, + "logps/rejected": -610.2501220703125, + "loss": 0.5669, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.0749659538269043, + "rewards/margins": 0.8318060636520386, + "rewards/rejected": -3.9067721366882324, + "step": 1530 + }, + { + "epoch": 0.2, + "grad_norm": 12.6875, + "learning_rate": 4.8449492040452495e-06, + "logits/chosen": 2.3675220012664795, + "logits/rejected": 3.3951308727264404, + "logps/chosen": -585.8651733398438, + "logps/rejected": -600.3690795898438, + "loss": 0.6549, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.3787026405334473, + "rewards/margins": 0.6356854438781738, + "rewards/rejected": -4.014388084411621, + "step": 1540 + }, + { + "epoch": 0.2, + "grad_norm": 15.125, + "learning_rate": 4.840965321369973e-06, + "logits/chosen": 2.4867424964904785, + "logits/rejected": 3.2565741539001465, + "logps/chosen": -635.0973510742188, + "logps/rejected": -656.8734130859375, + "loss": 0.6027, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.280498504638672, + "rewards/margins": 0.65467768907547, + "rewards/rejected": -3.935176134109497, + "step": 1550 + }, + { + "epoch": 0.2, + "grad_norm": 18.625, + "learning_rate": 4.8369325851678795e-06, + "logits/chosen": 2.721409559249878, + "logits/rejected": 3.499811887741089, + "logps/chosen": -575.1268310546875, + "logps/rejected": -651.55224609375, + "loss": 0.5457, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.1099348068237305, + "rewards/margins": 0.82475346326828, + "rewards/rejected": -3.934687852859497, + "step": 1560 + }, + { + "epoch": 0.21, + "grad_norm": 8.8125, + "learning_rate": 4.832851079598007e-06, + "logits/chosen": 3.0259532928466797, + "logits/rejected": 3.415708065032959, + "logps/chosen": -615.1463623046875, + "logps/rejected": -673.1775512695312, + "loss": 0.5671, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.413778305053711, + "rewards/margins": 0.6115729212760925, + "rewards/rejected": -4.025351047515869, + "step": 1570 + }, + { + "epoch": 0.21, + "grad_norm": 17.125, + "learning_rate": 4.828720889837158e-06, + "logits/chosen": 2.180393695831299, + "logits/rejected": 3.560591459274292, + "logps/chosen": -597.0384521484375, + "logps/rejected": -648.1893310546875, + "loss": 0.4638, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2695510387420654, + "rewards/margins": 1.0302797555923462, + "rewards/rejected": -4.299830436706543, + "step": 1580 + }, + { + "epoch": 0.21, + "grad_norm": 16.125, + "learning_rate": 4.824542102078125e-06, + "logits/chosen": 2.5585384368896484, + "logits/rejected": 3.3628134727478027, + "logps/chosen": -559.4739379882812, + "logps/rejected": -660.89013671875, + "loss": 0.4251, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.904505491256714, + "rewards/margins": 1.1467950344085693, + "rewards/rejected": -4.051300525665283, + "step": 1590 + }, + { + "epoch": 0.21, + "grad_norm": 8.5625, + "learning_rate": 4.820314803527888e-06, + "logits/chosen": 2.209808588027954, + "logits/rejected": 2.755720853805542, + "logps/chosen": -527.7376708984375, + "logps/rejected": -611.8505249023438, + "loss": 0.574, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.922621011734009, + "rewards/margins": 0.8603001832962036, + "rewards/rejected": -3.782921314239502, + "step": 1600 + }, + { + "epoch": 0.21, + "eval_logits/chosen": 3.930431842803955, + "eval_logits/rejected": 4.921872138977051, + "eval_logps/chosen": -535.627685546875, + "eval_logps/rejected": -599.0427856445312, + "eval_loss": 0.5638437271118164, + "eval_rewards/accuracies": 0.718999981880188, + "eval_rewards/chosen": -2.710066795349121, + "eval_rewards/margins": 0.8345724940299988, + "eval_rewards/rejected": -3.5446391105651855, + "eval_runtime": 1592.9451, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 1600 + }, + { + "epoch": 0.21, + "grad_norm": 15.75, + "learning_rate": 4.816039082405799e-06, + "logits/chosen": 2.2764475345611572, + "logits/rejected": 2.266634941101074, + "logps/chosen": -485.3955993652344, + "logps/rejected": -614.9955444335938, + "loss": 0.5072, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.3901500701904297, + "rewards/margins": 0.995215117931366, + "rewards/rejected": -3.3853652477264404, + "step": 1610 + }, + { + "epoch": 0.21, + "grad_norm": 7.59375, + "learning_rate": 4.81171502794174e-06, + "logits/chosen": 2.212632417678833, + "logits/rejected": 3.119572401046753, + "logps/chosen": -431.06182861328125, + "logps/rejected": -525.5626220703125, + "loss": 0.5715, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3643248081207275, + "rewards/margins": 0.8816196322441101, + "rewards/rejected": -3.2459442615509033, + "step": 1620 + }, + { + "epoch": 0.21, + "grad_norm": 5.96875, + "learning_rate": 4.8073427303742584e-06, + "logits/chosen": 2.651143789291382, + "logits/rejected": 3.788827896118164, + "logps/chosen": -436.7945861816406, + "logps/rejected": -547.8876953125, + "loss": 0.5112, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.291949510574341, + "rewards/margins": 1.0054199695587158, + "rewards/rejected": -3.2973690032958984, + "step": 1630 + }, + { + "epoch": 0.21, + "grad_norm": 17.625, + "learning_rate": 4.802922280948685e-06, + "logits/chosen": 2.9048256874084473, + "logits/rejected": 2.298372507095337, + "logps/chosen": -445.8115234375, + "logps/rejected": -583.6910400390625, + "loss": 0.5133, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4814229011535645, + "rewards/margins": 0.9504474401473999, + "rewards/rejected": -3.431870698928833, + "step": 1640 + }, + { + "epoch": 0.22, + "grad_norm": 9.75, + "learning_rate": 4.798453771915231e-06, + "logits/chosen": 2.131483316421509, + "logits/rejected": 2.286714553833008, + "logps/chosen": -550.1790161132812, + "logps/rejected": -614.5077514648438, + "loss": 0.5829, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.8355047702789307, + "rewards/margins": 0.6407086849212646, + "rewards/rejected": -3.4762134552001953, + "step": 1650 + }, + { + "epoch": 0.22, + "grad_norm": 10.0625, + "learning_rate": 4.793937296527062e-06, + "logits/chosen": 2.0528528690338135, + "logits/rejected": 3.4341843128204346, + "logps/chosen": -583.1158447265625, + "logps/rejected": -599.8079223632812, + "loss": 0.7019, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.1697096824645996, + "rewards/margins": 0.5639287233352661, + "rewards/rejected": -3.7336387634277344, + "step": 1660 + }, + { + "epoch": 0.22, + "grad_norm": 6.5625, + "learning_rate": 4.78937294903835e-06, + "logits/chosen": 1.3232476711273193, + "logits/rejected": 3.333897829055786, + "logps/chosen": -524.6771240234375, + "logps/rejected": -585.8258056640625, + "loss": 0.4743, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5064637660980225, + "rewards/margins": 1.062355875968933, + "rewards/rejected": -3.568819522857666, + "step": 1670 + }, + { + "epoch": 0.22, + "grad_norm": 12.1875, + "learning_rate": 4.78476082470231e-06, + "logits/chosen": 1.2536879777908325, + "logits/rejected": 2.9530956745147705, + "logps/chosen": -561.2202758789062, + "logps/rejected": -656.8157958984375, + "loss": 0.4279, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.7209866046905518, + "rewards/margins": 1.2556803226470947, + "rewards/rejected": -3.9766669273376465, + "step": 1680 + }, + { + "epoch": 0.22, + "grad_norm": 8.375, + "learning_rate": 4.780101019769212e-06, + "logits/chosen": 2.281708240509033, + "logits/rejected": 3.858910322189331, + "logps/chosen": -669.6038818359375, + "logps/rejected": -732.0891723632812, + "loss": 0.6202, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.8110530376434326, + "rewards/margins": 0.9638622403144836, + "rewards/rejected": -4.7749152183532715, + "step": 1690 + }, + { + "epoch": 0.22, + "grad_norm": 8.625, + "learning_rate": 4.775393631484368e-06, + "logits/chosen": 1.3264646530151367, + "logits/rejected": 3.5289459228515625, + "logps/chosen": -596.949462890625, + "logps/rejected": -665.1720581054688, + "loss": 0.4901, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.0804290771484375, + "rewards/margins": 1.0506683588027954, + "rewards/rejected": -4.131097316741943, + "step": 1700 + }, + { + "epoch": 0.22, + "eval_logits/chosen": 2.930548667907715, + "eval_logits/rejected": 3.8220255374908447, + "eval_logps/chosen": -513.6201171875, + "eval_logps/rejected": -580.3493041992188, + "eval_loss": 0.528390109539032, + "eval_rewards/accuracies": 0.7335000038146973, + "eval_rewards/chosen": -2.4899909496307373, + "eval_rewards/margins": 0.8677131533622742, + "eval_rewards/rejected": -3.3577041625976562, + "eval_runtime": 1592.0798, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 1700 + }, + { + "epoch": 0.22, + "grad_norm": 5.625, + "learning_rate": 4.770638758086105e-06, + "logits/chosen": 1.39579176902771, + "logits/rejected": 1.9867902994155884, + "logps/chosen": -454.69696044921875, + "logps/rejected": -540.9384765625, + "loss": 0.5284, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9679079055786133, + "rewards/margins": 0.7821384072303772, + "rewards/rejected": -2.7500462532043457, + "step": 1710 + }, + { + "epoch": 0.23, + "grad_norm": 15.625, + "learning_rate": 4.7658364988037184e-06, + "logits/chosen": 0.4757395386695862, + "logits/rejected": 1.7275594472885132, + "logps/chosen": -420.74237060546875, + "logps/rejected": -450.6329040527344, + "loss": 0.5074, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5048556327819824, + "rewards/margins": 0.7989991903305054, + "rewards/rejected": -2.3038547039031982, + "step": 1720 + }, + { + "epoch": 0.23, + "grad_norm": 19.0, + "learning_rate": 4.760986953855395e-06, + "logits/chosen": 0.5297383069992065, + "logits/rejected": 1.4827096462249756, + "logps/chosen": -479.01263427734375, + "logps/rejected": -472.995849609375, + "loss": 0.5399, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6811622381210327, + "rewards/margins": 0.7285897135734558, + "rewards/rejected": -2.4097516536712646, + "step": 1730 + }, + { + "epoch": 0.23, + "grad_norm": 17.875, + "learning_rate": 4.756090224446127e-06, + "logits/chosen": 1.6546382904052734, + "logits/rejected": 2.8603973388671875, + "logps/chosen": -511.18109130859375, + "logps/rejected": -617.1567993164062, + "loss": 0.4234, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.4521613121032715, + "rewards/margins": 1.1986668109893799, + "rewards/rejected": -3.6508285999298096, + "step": 1740 + }, + { + "epoch": 0.23, + "grad_norm": 15.9375, + "learning_rate": 4.7511464127655945e-06, + "logits/chosen": 2.412151575088501, + "logits/rejected": 2.7965240478515625, + "logps/chosen": -564.255859375, + "logps/rejected": -674.6925048828125, + "loss": 0.5344, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.802210569381714, + "rewards/margins": 1.1913468837738037, + "rewards/rejected": -3.9935576915740967, + "step": 1750 + }, + { + "epoch": 0.23, + "grad_norm": 19.25, + "learning_rate": 4.74615562198604e-06, + "logits/chosen": 2.0918798446655273, + "logits/rejected": 3.590376615524292, + "logps/chosen": -531.4205322265625, + "logps/rejected": -628.2448120117188, + "loss": 0.5798, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.993215322494507, + "rewards/margins": 1.028847336769104, + "rewards/rejected": -4.0220627784729, + "step": 1760 + }, + { + "epoch": 0.23, + "grad_norm": 7.84375, + "learning_rate": 4.741117956260107e-06, + "logits/chosen": 1.7307329177856445, + "logits/rejected": 2.0027899742126465, + "logps/chosen": -564.1361083984375, + "logps/rejected": -621.1322021484375, + "loss": 0.5516, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.6234326362609863, + "rewards/margins": 0.8255942463874817, + "rewards/rejected": -3.4490268230438232, + "step": 1770 + }, + { + "epoch": 0.23, + "grad_norm": 9.8125, + "learning_rate": 4.736033520718672e-06, + "logits/chosen": 0.6438279151916504, + "logits/rejected": 1.2121880054473877, + "logps/chosen": -454.52545166015625, + "logps/rejected": -499.5397033691406, + "loss": 0.5871, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9997762441635132, + "rewards/margins": 0.5949224233627319, + "rewards/rejected": -2.594698667526245, + "step": 1780 + }, + { + "epoch": 0.23, + "grad_norm": 15.125, + "learning_rate": 4.730902421468652e-06, + "logits/chosen": 0.7901066541671753, + "logits/rejected": 0.7325730919837952, + "logps/chosen": -462.3359375, + "logps/rejected": -552.4173583984375, + "loss": 0.5828, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.9955250024795532, + "rewards/margins": 0.6368887424468994, + "rewards/rejected": -2.632413864135742, + "step": 1790 + }, + { + "epoch": 0.24, + "grad_norm": 6.65625, + "learning_rate": 4.7257247655907854e-06, + "logits/chosen": 0.637048602104187, + "logits/rejected": 1.482924461364746, + "logps/chosen": -407.1905822753906, + "logps/rejected": -496.34161376953125, + "loss": 0.5149, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.690338134765625, + "rewards/margins": 0.9209977984428406, + "rewards/rejected": -2.6113357543945312, + "step": 1800 + }, + { + "epoch": 0.24, + "eval_logits/chosen": 1.2750667333602905, + "eval_logits/rejected": 2.0261995792388916, + "eval_logps/chosen": -439.68988037109375, + "eval_logps/rejected": -491.2047424316406, + "eval_loss": 0.5408201813697815, + "eval_rewards/accuracies": 0.7214999794960022, + "eval_rewards/chosen": -1.7506884336471558, + "eval_rewards/margins": 0.7155702114105225, + "eval_rewards/rejected": -2.4662585258483887, + "eval_runtime": 1592.6919, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 1800 + }, + { + "epoch": 0.24, + "grad_norm": 12.25, + "learning_rate": 4.720500661137397e-06, + "logits/chosen": 0.7021459341049194, + "logits/rejected": 1.6718536615371704, + "logps/chosen": -470.3685607910156, + "logps/rejected": -503.8094787597656, + "loss": 0.5659, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9385855197906494, + "rewards/margins": 0.6853693127632141, + "rewards/rejected": -2.6239547729492188, + "step": 1810 + }, + { + "epoch": 0.24, + "grad_norm": 15.5625, + "learning_rate": 4.71523021713015e-06, + "logits/chosen": 0.9356092214584351, + "logits/rejected": 1.5934399366378784, + "logps/chosen": -463.17303466796875, + "logps/rejected": -509.04864501953125, + "loss": 0.6068, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.297934055328369, + "rewards/margins": 0.5046521425247192, + "rewards/rejected": -2.802586078643799, + "step": 1820 + }, + { + "epoch": 0.24, + "grad_norm": 18.25, + "learning_rate": 4.709913543557761e-06, + "logits/chosen": 1.4963045120239258, + "logits/rejected": 1.9225801229476929, + "logps/chosen": -482.92852783203125, + "logps/rejected": -578.1611938476562, + "loss": 0.5275, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.498018980026245, + "rewards/margins": 1.007016897201538, + "rewards/rejected": -3.505035877227783, + "step": 1830 + }, + { + "epoch": 0.24, + "grad_norm": 9.0625, + "learning_rate": 4.704550751373715e-06, + "logits/chosen": 1.148832082748413, + "logits/rejected": 1.9787461757659912, + "logps/chosen": -577.2870483398438, + "logps/rejected": -619.6170043945312, + "loss": 0.6045, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.957585096359253, + "rewards/margins": 0.6815073490142822, + "rewards/rejected": -3.639092206954956, + "step": 1840 + }, + { + "epoch": 0.24, + "grad_norm": 4.5, + "learning_rate": 4.699141952493941e-06, + "logits/chosen": 1.7764909267425537, + "logits/rejected": 2.5067789554595947, + "logps/chosen": -537.6290893554688, + "logps/rejected": -584.9334106445312, + "loss": 0.5237, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.823072910308838, + "rewards/margins": 0.7261917591094971, + "rewards/rejected": -3.549264907836914, + "step": 1850 + }, + { + "epoch": 0.24, + "grad_norm": 18.625, + "learning_rate": 4.6936872597944814e-06, + "logits/chosen": 1.7310603857040405, + "logits/rejected": 2.559417247772217, + "logps/chosen": -464.6128845214844, + "logps/rejected": -559.8570556640625, + "loss": 0.4748, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.3967392444610596, + "rewards/margins": 0.9208498001098633, + "rewards/rejected": -3.317589282989502, + "step": 1860 + }, + { + "epoch": 0.24, + "grad_norm": 20.25, + "learning_rate": 4.688186787109136e-06, + "logits/chosen": 1.956180214881897, + "logits/rejected": 3.564734935760498, + "logps/chosen": -528.5537109375, + "logps/rejected": -586.9791870117188, + "loss": 0.5301, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.75168514251709, + "rewards/margins": 0.8507223129272461, + "rewards/rejected": -3.602407455444336, + "step": 1870 + }, + { + "epoch": 0.25, + "grad_norm": 8.3125, + "learning_rate": 4.682640649227085e-06, + "logits/chosen": 1.8421905040740967, + "logits/rejected": 3.96189546585083, + "logps/chosen": -570.6896362304688, + "logps/rejected": -673.6886596679688, + "loss": 0.4183, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9929141998291016, + "rewards/margins": 1.4335700273513794, + "rewards/rejected": -4.42648458480835, + "step": 1880 + }, + { + "epoch": 0.25, + "grad_norm": 26.625, + "learning_rate": 4.677048961890492e-06, + "logits/chosen": 2.172407627105713, + "logits/rejected": 2.843996524810791, + "logps/chosen": -598.6932373046875, + "logps/rejected": -748.4923095703125, + "loss": 0.5991, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.3650193214416504, + "rewards/margins": 1.2889230251312256, + "rewards/rejected": -4.653942108154297, + "step": 1890 + }, + { + "epoch": 0.25, + "grad_norm": 25.375, + "learning_rate": 4.671411841792096e-06, + "logits/chosen": 1.5214734077453613, + "logits/rejected": 1.8940036296844482, + "logps/chosen": -554.8588256835938, + "logps/rejected": -598.6972045898438, + "loss": 0.6382, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.618443012237549, + "rewards/margins": 0.6023415327072144, + "rewards/rejected": -3.2207846641540527, + "step": 1900 + }, + { + "epoch": 0.25, + "eval_logits/chosen": 1.4989570379257202, + "eval_logits/rejected": 2.4038922786712646, + "eval_logps/chosen": -477.3052062988281, + "eval_logps/rejected": -540.05419921875, + "eval_loss": 0.5325160622596741, + "eval_rewards/accuracies": 0.7254999876022339, + "eval_rewards/chosen": -2.1268417835235596, + "eval_rewards/margins": 0.8279104232788086, + "eval_rewards/rejected": -2.954752206802368, + "eval_runtime": 1591.5791, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 1900 + }, + { + "epoch": 0.25, + "grad_norm": 11.5, + "learning_rate": 4.665729406572764e-06, + "logits/chosen": -0.22512850165367126, + "logits/rejected": 0.8694722056388855, + "logps/chosen": -463.45330810546875, + "logps/rejected": -509.54461669921875, + "loss": 0.5328, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8640069961547852, + "rewards/margins": 0.756619930267334, + "rewards/rejected": -2.620626926422119, + "step": 1910 + }, + { + "epoch": 0.25, + "grad_norm": 11.25, + "learning_rate": 4.660001774819048e-06, + "logits/chosen": 0.6744810342788696, + "logits/rejected": 2.1456856727600098, + "logps/chosen": -497.3128967285156, + "logps/rejected": -557.4090576171875, + "loss": 0.4914, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0291192531585693, + "rewards/margins": 0.9803076982498169, + "rewards/rejected": -3.0094268321990967, + "step": 1920 + }, + { + "epoch": 0.25, + "grad_norm": 7.71875, + "learning_rate": 4.654229066060702e-06, + "logits/chosen": 2.223780870437622, + "logits/rejected": 2.437018871307373, + "logps/chosen": -439.65240478515625, + "logps/rejected": -563.2244873046875, + "loss": 0.5364, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.4993643760681152, + "rewards/margins": 0.9292726516723633, + "rewards/rejected": -3.4286370277404785, + "step": 1930 + }, + { + "epoch": 0.25, + "grad_norm": 9.3125, + "learning_rate": 4.648411400768193e-06, + "logits/chosen": 1.3439040184020996, + "logits/rejected": 2.485945463180542, + "logps/chosen": -523.4782104492188, + "logps/rejected": -575.5657958984375, + "loss": 0.5693, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.6590380668640137, + "rewards/margins": 0.8810779452323914, + "rewards/rejected": -3.54011607170105, + "step": 1940 + }, + { + "epoch": 0.26, + "grad_norm": 8.9375, + "learning_rate": 4.642548900350182e-06, + "logits/chosen": 1.0268100500106812, + "logits/rejected": 1.1440961360931396, + "logps/chosen": -480.35028076171875, + "logps/rejected": -516.804443359375, + "loss": 0.6716, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.193833827972412, + "rewards/margins": 0.443396657705307, + "rewards/rejected": -2.637230396270752, + "step": 1950 + }, + { + "epoch": 0.26, + "grad_norm": 12.0, + "learning_rate": 4.636641687150994e-06, + "logits/chosen": -0.1693330854177475, + "logits/rejected": 0.4488976001739502, + "logps/chosen": -465.98529052734375, + "logps/rejected": -505.60015869140625, + "loss": 0.5583, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8904263973236084, + "rewards/margins": 0.6098214983940125, + "rewards/rejected": -2.5002479553222656, + "step": 1960 + }, + { + "epoch": 0.26, + "grad_norm": 12.0625, + "learning_rate": 4.6306898844480615e-06, + "logits/chosen": 0.20104345679283142, + "logits/rejected": 1.6518051624298096, + "logps/chosen": -539.2623901367188, + "logps/rejected": -560.7965698242188, + "loss": 0.4878, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.1119306087493896, + "rewards/margins": 0.8724871873855591, + "rewards/rejected": -2.984417676925659, + "step": 1970 + }, + { + "epoch": 0.26, + "grad_norm": 8.75, + "learning_rate": 4.624693616449358e-06, + "logits/chosen": 0.3932510018348694, + "logits/rejected": 1.3235846757888794, + "logps/chosen": -443.524658203125, + "logps/rejected": -467.9271545410156, + "loss": 0.5635, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0940842628479004, + "rewards/margins": 0.68193119764328, + "rewards/rejected": -2.7760157585144043, + "step": 1980 + }, + { + "epoch": 0.26, + "grad_norm": 15.625, + "learning_rate": 4.6186530082908e-06, + "logits/chosen": 0.6695358157157898, + "logits/rejected": 0.8175237774848938, + "logps/chosen": -404.23663330078125, + "logps/rejected": -461.8468322753906, + "loss": 0.6994, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.012613534927368, + "rewards/margins": 0.565791130065918, + "rewards/rejected": -2.578404664993286, + "step": 1990 + }, + { + "epoch": 0.26, + "grad_norm": 18.75, + "learning_rate": 4.612568186033633e-06, + "logits/chosen": -0.0862119197845459, + "logits/rejected": 1.158911943435669, + "logps/chosen": -453.42706298828125, + "logps/rejected": -497.0005798339844, + "loss": 0.5178, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5579051971435547, + "rewards/margins": 0.8818111419677734, + "rewards/rejected": -2.4397165775299072, + "step": 2000 + }, + { + "epoch": 0.26, + "eval_logits/chosen": 0.8156982064247131, + "eval_logits/rejected": 1.528757929801941, + "eval_logps/chosen": -406.8324279785156, + "eval_logps/rejected": -459.8389587402344, + "eval_loss": 0.5275577306747437, + "eval_rewards/accuracies": 0.7304999828338623, + "eval_rewards/chosen": -1.4221142530441284, + "eval_rewards/margins": 0.7304863333702087, + "eval_rewards/rejected": -2.1526007652282715, + "eval_runtime": 1591.2153, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 2000 + }, + { + "epoch": 0.26, + "grad_norm": 11.6875, + "learning_rate": 4.6064392766618125e-06, + "logits/chosen": 0.2911849617958069, + "logits/rejected": 1.4484639167785645, + "logps/chosen": -415.20855712890625, + "logps/rejected": -454.56243896484375, + "loss": 0.4964, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.474690318107605, + "rewards/margins": 0.811174213886261, + "rewards/rejected": -2.2858645915985107, + "step": 2010 + }, + { + "epoch": 0.26, + "grad_norm": 12.3125, + "learning_rate": 4.60026640807934e-06, + "logits/chosen": 1.1492453813552856, + "logits/rejected": 1.788846731185913, + "logps/chosen": -466.96124267578125, + "logps/rejected": -536.4459228515625, + "loss": 0.5872, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.9817111492156982, + "rewards/margins": 0.7023847699165344, + "rewards/rejected": -2.684096097946167, + "step": 2020 + }, + { + "epoch": 0.27, + "grad_norm": 8.9375, + "learning_rate": 4.594049709107604e-06, + "logits/chosen": 1.0384540557861328, + "logits/rejected": 1.8466717004776, + "logps/chosen": -509.6580505371094, + "logps/rejected": -553.58349609375, + "loss": 0.5398, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.156644105911255, + "rewards/margins": 0.8427260518074036, + "rewards/rejected": -2.9993700981140137, + "step": 2030 + }, + { + "epoch": 0.27, + "grad_norm": 9.75, + "learning_rate": 4.587789309482687e-06, + "logits/chosen": 1.0068624019622803, + "logits/rejected": 2.298288583755493, + "logps/chosen": -425.91693115234375, + "logps/rejected": -510.83453369140625, + "loss": 0.4572, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.729448676109314, + "rewards/margins": 0.987391471862793, + "rewards/rejected": -2.7168402671813965, + "step": 2040 + }, + { + "epoch": 0.27, + "grad_norm": 10.4375, + "learning_rate": 4.581485339852659e-06, + "logits/chosen": 1.0009605884552002, + "logits/rejected": 2.157416582107544, + "logps/chosen": -499.15863037109375, + "logps/rejected": -580.7846069335938, + "loss": 0.5363, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.099259853363037, + "rewards/margins": 1.011781930923462, + "rewards/rejected": -3.11104154586792, + "step": 2050 + }, + { + "epoch": 0.27, + "grad_norm": 16.25, + "learning_rate": 4.5751379317748514e-06, + "logits/chosen": 1.6248559951782227, + "logits/rejected": 2.572545051574707, + "logps/chosen": -483.1776428222656, + "logps/rejected": -605.381103515625, + "loss": 0.4643, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.4639997482299805, + "rewards/margins": 1.2903715372085571, + "rewards/rejected": -3.754370927810669, + "step": 2060 + }, + { + "epoch": 0.27, + "grad_norm": 17.25, + "learning_rate": 4.56874721771311e-06, + "logits/chosen": 1.5964148044586182, + "logits/rejected": 2.4404406547546387, + "logps/chosen": -534.6270751953125, + "logps/rejected": -651.9520263671875, + "loss": 0.4478, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.535792827606201, + "rewards/margins": 1.3512599468231201, + "rewards/rejected": -3.8870530128479004, + "step": 2070 + }, + { + "epoch": 0.27, + "grad_norm": 20.875, + "learning_rate": 4.562313331035032e-06, + "logits/chosen": 1.3664486408233643, + "logits/rejected": 2.4883315563201904, + "logps/chosen": -552.6138305664062, + "logps/rejected": -656.958251953125, + "loss": 0.4909, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7245543003082275, + "rewards/margins": 1.1670589447021484, + "rewards/rejected": -3.8916125297546387, + "step": 2080 + }, + { + "epoch": 0.27, + "grad_norm": 14.25, + "learning_rate": 4.555836406009183e-06, + "logits/chosen": 0.7134484052658081, + "logits/rejected": 1.9421746730804443, + "logps/chosen": -539.8816528320312, + "logps/rejected": -651.5926513671875, + "loss": 0.4775, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.59112811088562, + "rewards/margins": 1.2943382263183594, + "rewards/rejected": -3.8854668140411377, + "step": 2090 + }, + { + "epoch": 0.27, + "grad_norm": 19.625, + "learning_rate": 4.5493165778022945e-06, + "logits/chosen": 1.732616662979126, + "logits/rejected": 1.5265815258026123, + "logps/chosen": -537.0846557617188, + "logps/rejected": -678.6043701171875, + "loss": 0.524, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.924956798553467, + "rewards/margins": 1.0166305303573608, + "rewards/rejected": -3.941587448120117, + "step": 2100 + }, + { + "epoch": 0.27, + "eval_logits/chosen": 1.6624584197998047, + "eval_logits/rejected": 2.595505714416504, + "eval_logps/chosen": -535.6265869140625, + "eval_logps/rejected": -615.3445434570312, + "eval_loss": 0.5663179159164429, + "eval_rewards/accuracies": 0.7110000252723694, + "eval_rewards/chosen": -2.710054874420166, + "eval_rewards/margins": 0.9976009130477905, + "eval_rewards/rejected": -3.707655906677246, + "eval_runtime": 1591.602, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 2100 + }, + { + "epoch": 0.28, + "grad_norm": 8.875, + "learning_rate": 4.542753982476443e-06, + "logits/chosen": 0.8328291177749634, + "logits/rejected": 1.094390869140625, + "logps/chosen": -505.59417724609375, + "logps/rejected": -693.96630859375, + "loss": 0.3576, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6475024223327637, + "rewards/margins": 1.756042718887329, + "rewards/rejected": -4.403544902801514, + "step": 2110 + }, + { + "epoch": 0.28, + "grad_norm": 22.5, + "learning_rate": 4.53614875698621e-06, + "logits/chosen": 0.9178856015205383, + "logits/rejected": 2.5568594932556152, + "logps/chosen": -729.9873046875, + "logps/rejected": -763.2886962890625, + "loss": 0.9187, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.440108299255371, + "rewards/margins": 1.0146429538726807, + "rewards/rejected": -5.454751491546631, + "step": 2120 + }, + { + "epoch": 0.28, + "grad_norm": 20.5, + "learning_rate": 4.529501039175824e-06, + "logits/chosen": 1.178547978401184, + "logits/rejected": 2.043323516845703, + "logps/chosen": -494.15679931640625, + "logps/rejected": -580.7507934570312, + "loss": 0.5003, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2062692642211914, + "rewards/margins": 1.0896488428115845, + "rewards/rejected": -3.2959182262420654, + "step": 2130 + }, + { + "epoch": 0.28, + "grad_norm": 7.84375, + "learning_rate": 4.522810967776287e-06, + "logits/chosen": 0.9719040989875793, + "logits/rejected": 1.7808208465576172, + "logps/chosen": -494.247802734375, + "logps/rejected": -565.2037963867188, + "loss": 0.5097, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8909088373184204, + "rewards/margins": 0.9046257138252258, + "rewards/rejected": -2.795534610748291, + "step": 2140 + }, + { + "epoch": 0.28, + "grad_norm": 23.25, + "learning_rate": 4.516078682402473e-06, + "logits/chosen": 0.9452457427978516, + "logits/rejected": 2.6888928413391113, + "logps/chosen": -469.39697265625, + "logps/rejected": -555.2576904296875, + "loss": 0.524, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.2237601280212402, + "rewards/margins": 0.8853996396064758, + "rewards/rejected": -3.1091599464416504, + "step": 2150 + }, + { + "epoch": 0.28, + "grad_norm": 24.25, + "learning_rate": 4.509304323550221e-06, + "logits/chosen": 1.5407735109329224, + "logits/rejected": 2.537536859512329, + "logps/chosen": -514.2586669921875, + "logps/rejected": -597.7869873046875, + "loss": 0.5327, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.269601583480835, + "rewards/margins": 1.054694652557373, + "rewards/rejected": -3.324296236038208, + "step": 2160 + }, + { + "epoch": 0.28, + "grad_norm": 9.6875, + "learning_rate": 4.502488032593398e-06, + "logits/chosen": 2.9527904987335205, + "logits/rejected": 3.8193678855895996, + "logps/chosen": -491.11798095703125, + "logps/rejected": -604.221435546875, + "loss": 0.508, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4709322452545166, + "rewards/margins": 1.2204992771148682, + "rewards/rejected": -3.6914315223693848, + "step": 2170 + }, + { + "epoch": 0.29, + "grad_norm": 25.375, + "learning_rate": 4.495629951780951e-06, + "logits/chosen": 2.891505718231201, + "logits/rejected": 2.733696937561035, + "logps/chosen": -567.9174194335938, + "logps/rejected": -632.1881103515625, + "loss": 0.6984, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.162008762359619, + "rewards/margins": 0.792600691318512, + "rewards/rejected": -3.9546093940734863, + "step": 2180 + }, + { + "epoch": 0.29, + "grad_norm": 23.125, + "learning_rate": 4.488730224233941e-06, + "logits/chosen": 2.1970956325531006, + "logits/rejected": 2.9282336235046387, + "logps/chosen": -524.8446044921875, + "logps/rejected": -594.5263671875, + "loss": 0.5236, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.49332332611084, + "rewards/margins": 1.0459768772125244, + "rewards/rejected": -3.5393002033233643, + "step": 2190 + }, + { + "epoch": 0.29, + "grad_norm": 22.25, + "learning_rate": 4.481788993942547e-06, + "logits/chosen": 2.2267377376556396, + "logits/rejected": 3.006579875946045, + "logps/chosen": -480.3287048339844, + "logps/rejected": -555.52978515625, + "loss": 0.523, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.2022452354431152, + "rewards/margins": 1.0402004718780518, + "rewards/rejected": -3.242445468902588, + "step": 2200 + }, + { + "epoch": 0.29, + "eval_logits/chosen": 2.543581962585449, + "eval_logits/rejected": 3.595458984375, + "eval_logps/chosen": -493.3342590332031, + "eval_logps/rejected": -578.9616088867188, + "eval_loss": 0.5422174334526062, + "eval_rewards/accuracies": 0.7229999899864197, + "eval_rewards/chosen": -2.2871320247650146, + "eval_rewards/margins": 1.0566951036453247, + "eval_rewards/rejected": -3.343827486038208, + "eval_runtime": 1591.1009, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 2200 + }, + { + "epoch": 0.29, + "grad_norm": 16.75, + "learning_rate": 4.474806405763076e-06, + "logits/chosen": 1.3435853719711304, + "logits/rejected": 2.144718885421753, + "logps/chosen": -518.1646728515625, + "logps/rejected": -567.1131591796875, + "loss": 0.6472, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.471111297607422, + "rewards/margins": 0.698428213596344, + "rewards/rejected": -3.169539451599121, + "step": 2210 + }, + { + "epoch": 0.29, + "grad_norm": 15.25, + "learning_rate": 4.4677826054149235e-06, + "logits/chosen": 0.6909424066543579, + "logits/rejected": 1.4299237728118896, + "logps/chosen": -452.12847900390625, + "logps/rejected": -496.40625, + "loss": 0.5732, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.831134557723999, + "rewards/margins": 0.7007611989974976, + "rewards/rejected": -2.531895875930786, + "step": 2220 + }, + { + "epoch": 0.29, + "grad_norm": 7.875, + "learning_rate": 4.460717739477543e-06, + "logits/chosen": 0.33534538745880127, + "logits/rejected": 0.7390525937080383, + "logps/chosen": -426.89727783203125, + "logps/rejected": -453.79998779296875, + "loss": 0.6528, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.5729742050170898, + "rewards/margins": 0.42158904671669006, + "rewards/rejected": -1.994563341140747, + "step": 2230 + }, + { + "epoch": 0.29, + "grad_norm": 8.0, + "learning_rate": 4.4536119553873866e-06, + "logits/chosen": -0.13674196600914001, + "logits/rejected": 1.186537742614746, + "logps/chosen": -401.02313232421875, + "logps/rejected": -491.7369689941406, + "loss": 0.4291, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5359418392181396, + "rewards/margins": 1.016803503036499, + "rewards/rejected": -2.5527453422546387, + "step": 2240 + }, + { + "epoch": 0.29, + "grad_norm": 11.8125, + "learning_rate": 4.446465401434824e-06, + "logits/chosen": 1.0150126218795776, + "logits/rejected": 1.48782479763031, + "logps/chosen": -477.7135314941406, + "logps/rejected": -568.2080078125, + "loss": 0.5028, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1706626415252686, + "rewards/margins": 1.0744900703430176, + "rewards/rejected": -3.245152235031128, + "step": 2250 + }, + { + "epoch": 0.3, + "grad_norm": 20.625, + "learning_rate": 4.43927822676105e-06, + "logits/chosen": 1.2589080333709717, + "logits/rejected": 2.024796962738037, + "logps/chosen": -509.18017578125, + "logps/rejected": -593.2972412109375, + "loss": 0.6145, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.477821111679077, + "rewards/margins": 0.9366565942764282, + "rewards/rejected": -3.414477825164795, + "step": 2260 + }, + { + "epoch": 0.3, + "grad_norm": 29.0, + "learning_rate": 4.432050581354972e-06, + "logits/chosen": 0.7604560852050781, + "logits/rejected": 1.928553819656372, + "logps/chosen": -517.0828857421875, + "logps/rejected": -547.6974487304688, + "loss": 0.5421, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4186508655548096, + "rewards/margins": 1.0623056888580322, + "rewards/rejected": -3.480956554412842, + "step": 2270 + }, + { + "epoch": 0.3, + "grad_norm": 14.625, + "learning_rate": 4.424782616050078e-06, + "logits/chosen": 1.0883769989013672, + "logits/rejected": 2.2119033336639404, + "logps/chosen": -459.88299560546875, + "logps/rejected": -527.73193359375, + "loss": 0.474, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.022667169570923, + "rewards/margins": 1.0422003269195557, + "rewards/rejected": -3.0648674964904785, + "step": 2280 + }, + { + "epoch": 0.3, + "grad_norm": 7.65625, + "learning_rate": 4.4174744825212954e-06, + "logits/chosen": 0.5496788024902344, + "logits/rejected": 2.727740526199341, + "logps/chosen": -523.7022094726562, + "logps/rejected": -566.1196899414062, + "loss": 0.4936, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.125478506088257, + "rewards/margins": 1.0277146100997925, + "rewards/rejected": -3.1531929969787598, + "step": 2290 + }, + { + "epoch": 0.3, + "grad_norm": 7.46875, + "learning_rate": 4.410126333281815e-06, + "logits/chosen": 1.8152294158935547, + "logits/rejected": 2.9172585010528564, + "logps/chosen": -488.97698974609375, + "logps/rejected": -560.0440673828125, + "loss": 0.5431, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2743351459503174, + "rewards/margins": 0.9938778877258301, + "rewards/rejected": -3.2682127952575684, + "step": 2300 + }, + { + "epoch": 0.3, + "eval_logits/chosen": 3.20039963722229, + "eval_logits/rejected": 4.24326229095459, + "eval_logps/chosen": -483.9386901855469, + "eval_logps/rejected": -566.412353515625, + "eval_loss": 0.52531898021698, + "eval_rewards/accuracies": 0.734000027179718, + "eval_rewards/chosen": -2.193176507949829, + "eval_rewards/margins": 1.0251572132110596, + "eval_rewards/rejected": -3.2183339595794678, + "eval_runtime": 1591.4881, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 2300 + }, + { + "epoch": 0.3, + "grad_norm": 5.9375, + "learning_rate": 4.402738321679918e-06, + "logits/chosen": 2.252061367034912, + "logits/rejected": 3.324723720550537, + "logps/chosen": -471.62652587890625, + "logps/rejected": -554.5105590820312, + "loss": 0.4985, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8434559106826782, + "rewards/margins": 1.3258936405181885, + "rewards/rejected": -3.169349193572998, + "step": 2310 + }, + { + "epoch": 0.3, + "grad_norm": 16.25, + "learning_rate": 4.395310601895772e-06, + "logits/chosen": 2.3367249965667725, + "logits/rejected": 3.5756278038024902, + "logps/chosen": -453.25933837890625, + "logps/rejected": -523.735107421875, + "loss": 0.4899, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.076174259185791, + "rewards/margins": 1.1950877904891968, + "rewards/rejected": -3.271261692047119, + "step": 2320 + }, + { + "epoch": 0.3, + "grad_norm": 6.40625, + "learning_rate": 4.38784332893821e-06, + "logits/chosen": 2.454847574234009, + "logits/rejected": 3.7376132011413574, + "logps/chosen": -522.7352294921875, + "logps/rejected": -657.8385009765625, + "loss": 0.5333, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.667996883392334, + "rewards/margins": 1.2865402698516846, + "rewards/rejected": -3.9545376300811768, + "step": 2330 + }, + { + "epoch": 0.31, + "grad_norm": 16.0, + "learning_rate": 4.380336658641503e-06, + "logits/chosen": 2.550139904022217, + "logits/rejected": 3.860103130340576, + "logps/chosen": -598.062744140625, + "logps/rejected": -661.4760131835938, + "loss": 0.6555, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.558239459991455, + "rewards/margins": 0.8881810307502747, + "rewards/rejected": -4.446420669555664, + "step": 2340 + }, + { + "epoch": 0.31, + "grad_norm": 7.625, + "learning_rate": 4.372790747662101e-06, + "logits/chosen": 3.1860485076904297, + "logits/rejected": 4.244828224182129, + "logps/chosen": -585.7864379882812, + "logps/rejected": -661.202880859375, + "loss": 0.5341, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.656553268432617, + "rewards/margins": 0.9850096702575684, + "rewards/rejected": -4.6415629386901855, + "step": 2350 + }, + { + "epoch": 0.31, + "grad_norm": 19.25, + "learning_rate": 4.365205753475367e-06, + "logits/chosen": 2.958014488220215, + "logits/rejected": 3.7595715522766113, + "logps/chosen": -587.1311645507812, + "logps/rejected": -652.6437377929688, + "loss": 0.5951, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.6158947944641113, + "rewards/margins": 0.7533068656921387, + "rewards/rejected": -4.369201183319092, + "step": 2360 + }, + { + "epoch": 0.31, + "grad_norm": 8.9375, + "learning_rate": 4.35758183437229e-06, + "logits/chosen": 2.5866026878356934, + "logits/rejected": 3.5570175647735596, + "logps/chosen": -560.3768920898438, + "logps/rejected": -637.0328369140625, + "loss": 0.4961, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.0859694480895996, + "rewards/margins": 0.9284812211990356, + "rewards/rejected": -4.014450550079346, + "step": 2370 + }, + { + "epoch": 0.31, + "grad_norm": 20.25, + "learning_rate": 4.3499191494561835e-06, + "logits/chosen": 3.0706686973571777, + "logits/rejected": 3.0112204551696777, + "logps/chosen": -586.0211181640625, + "logps/rejected": -692.83056640625, + "loss": 0.5437, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.1680941581726074, + "rewards/margins": 0.9448806643486023, + "rewards/rejected": -4.112975120544434, + "step": 2380 + }, + { + "epoch": 0.31, + "grad_norm": 23.0, + "learning_rate": 4.3422178586393615e-06, + "logits/chosen": 3.2278523445129395, + "logits/rejected": 3.951270341873169, + "logps/chosen": -556.1135864257812, + "logps/rejected": -637.8445434570312, + "loss": 0.5511, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.0236804485321045, + "rewards/margins": 0.825579822063446, + "rewards/rejected": -3.8492603302001953, + "step": 2390 + }, + { + "epoch": 0.31, + "grad_norm": 23.875, + "learning_rate": 4.334478122639804e-06, + "logits/chosen": 2.0170609951019287, + "logits/rejected": 3.2398910522460938, + "logps/chosen": -566.9010009765625, + "logps/rejected": -622.6240234375, + "loss": 0.5147, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.843874454498291, + "rewards/margins": 0.7945558428764343, + "rewards/rejected": -3.6384308338165283, + "step": 2400 + }, + { + "epoch": 0.31, + "eval_logits/chosen": 3.6861412525177, + "eval_logits/rejected": 4.677193641662598, + "eval_logps/chosen": -549.0342407226562, + "eval_logps/rejected": -632.5286254882812, + "eval_loss": 0.5131849646568298, + "eval_rewards/accuracies": 0.7315000295639038, + "eval_rewards/chosen": -2.8441319465637207, + "eval_rewards/margins": 1.035365104675293, + "eval_rewards/rejected": -3.8794968128204346, + "eval_runtime": 1590.6945, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 2400 + }, + { + "epoch": 0.32, + "grad_norm": 11.375, + "learning_rate": 4.3267001029778015e-06, + "logits/chosen": 1.7796494960784912, + "logits/rejected": 3.778020143508911, + "logps/chosen": -546.0569458007812, + "logps/rejected": -683.8525390625, + "loss": 0.3845, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.809384822845459, + "rewards/margins": 1.5442684888839722, + "rewards/rejected": -4.353653907775879, + "step": 2410 + }, + { + "epoch": 0.32, + "grad_norm": 15.25, + "learning_rate": 4.318883961972585e-06, + "logits/chosen": 2.531733989715576, + "logits/rejected": 3.231806516647339, + "logps/chosen": -588.2597045898438, + "logps/rejected": -721.5579223632812, + "loss": 0.3592, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.820429563522339, + "rewards/margins": 1.508816123008728, + "rewards/rejected": -4.329245567321777, + "step": 2420 + }, + { + "epoch": 0.32, + "grad_norm": 13.625, + "learning_rate": 4.311029862738942e-06, + "logits/chosen": 2.3296523094177246, + "logits/rejected": 4.2262067794799805, + "logps/chosen": -591.3225708007812, + "logps/rejected": -731.8248901367188, + "loss": 0.4765, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.486849308013916, + "rewards/margins": 1.516104817390442, + "rewards/rejected": -5.00295352935791, + "step": 2430 + }, + { + "epoch": 0.32, + "grad_norm": 18.25, + "learning_rate": 4.303137969183804e-06, + "logits/chosen": 3.0578298568725586, + "logits/rejected": 3.899970531463623, + "logps/chosen": -667.5275268554688, + "logps/rejected": -920.3175659179688, + "loss": 0.3092, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.254639625549316, + "rewards/margins": 2.5148143768310547, + "rewards/rejected": -6.7694549560546875, + "step": 2440 + }, + { + "epoch": 0.32, + "grad_norm": 33.5, + "learning_rate": 4.295208446002832e-06, + "logits/chosen": 3.725722551345825, + "logits/rejected": 4.002905368804932, + "logps/chosen": -707.6356811523438, + "logps/rejected": -930.8513793945312, + "loss": 0.5664, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.71712589263916, + "rewards/margins": 1.9741309881210327, + "rewards/rejected": -6.691256523132324, + "step": 2450 + }, + { + "epoch": 0.32, + "grad_norm": 24.125, + "learning_rate": 4.287241458676981e-06, + "logits/chosen": 2.3860459327697754, + "logits/rejected": 3.560899257659912, + "logps/chosen": -657.5707397460938, + "logps/rejected": -731.2125244140625, + "loss": 0.7209, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.795893430709839, + "rewards/margins": 1.0225095748901367, + "rewards/rejected": -4.818403244018555, + "step": 2460 + }, + { + "epoch": 0.32, + "grad_norm": 6.53125, + "learning_rate": 4.279237173469043e-06, + "logits/chosen": 0.9039813876152039, + "logits/rejected": 2.137092351913452, + "logps/chosen": -501.758056640625, + "logps/rejected": -549.93896484375, + "loss": 0.5286, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.340841770172119, + "rewards/margins": 1.0801969766616821, + "rewards/rejected": -3.4210383892059326, + "step": 2470 + }, + { + "epoch": 0.32, + "grad_norm": 8.4375, + "learning_rate": 4.271195757420177e-06, + "logits/chosen": 1.0410025119781494, + "logits/rejected": 1.2670671939849854, + "logps/chosen": -480.5115661621094, + "logps/rejected": -636.4711303710938, + "loss": 0.498, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0763466358184814, + "rewards/margins": 1.0617152452468872, + "rewards/rejected": -3.138062000274658, + "step": 2480 + }, + { + "epoch": 0.33, + "grad_norm": 10.375, + "learning_rate": 4.263117378346425e-06, + "logits/chosen": 1.2630422115325928, + "logits/rejected": 2.5274930000305176, + "logps/chosen": -476.0921325683594, + "logps/rejected": -531.6831665039062, + "loss": 0.5598, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.141141414642334, + "rewards/margins": 1.0511398315429688, + "rewards/rejected": -3.1922812461853027, + "step": 2490 + }, + { + "epoch": 0.33, + "grad_norm": 15.5, + "learning_rate": 4.255002204835208e-06, + "logits/chosen": 1.1449196338653564, + "logits/rejected": 1.7821890115737915, + "logps/chosen": -445.55047607421875, + "logps/rejected": -591.94580078125, + "loss": 0.4198, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1304402351379395, + "rewards/margins": 1.2741626501083374, + "rewards/rejected": -3.4046032428741455, + "step": 2500 + }, + { + "epoch": 0.33, + "eval_logits/chosen": 1.8510550260543823, + "eval_logits/rejected": 2.794989824295044, + "eval_logps/chosen": -482.1783447265625, + "eval_logps/rejected": -559.00537109375, + "eval_loss": 0.5213505029678345, + "eval_rewards/accuracies": 0.7289999723434448, + "eval_rewards/chosen": -2.1755733489990234, + "eval_rewards/margins": 0.9686914086341858, + "eval_rewards/rejected": -3.1442646980285645, + "eval_runtime": 1592.0183, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 2500 + }, + { + "epoch": 0.33, + "grad_norm": 9.625, + "learning_rate": 4.246850406241812e-06, + "logits/chosen": 0.7812983393669128, + "logits/rejected": 1.765454649925232, + "logps/chosen": -497.0265197753906, + "logps/rejected": -563.72216796875, + "loss": 0.4645, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.216409683227539, + "rewards/margins": 0.9073765873908997, + "rewards/rejected": -3.123786211013794, + "step": 2510 + }, + { + "epoch": 0.33, + "grad_norm": 22.125, + "learning_rate": 4.2386621526858465e-06, + "logits/chosen": 1.0584173202514648, + "logits/rejected": 2.6873929500579834, + "logps/chosen": -561.4659423828125, + "logps/rejected": -631.4401245117188, + "loss": 0.5197, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.6689488887786865, + "rewards/margins": 1.138954758644104, + "rewards/rejected": -3.80790376663208, + "step": 2520 + }, + { + "epoch": 0.33, + "grad_norm": 33.0, + "learning_rate": 4.2304376150477015e-06, + "logits/chosen": 1.5355218648910522, + "logits/rejected": 2.0197160243988037, + "logps/chosen": -534.4373779296875, + "logps/rejected": -681.6929931640625, + "loss": 0.485, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.7461304664611816, + "rewards/margins": 1.2848045825958252, + "rewards/rejected": -4.030934810638428, + "step": 2530 + }, + { + "epoch": 0.33, + "grad_norm": 16.625, + "learning_rate": 4.222176964964977e-06, + "logits/chosen": 1.1447181701660156, + "logits/rejected": 2.849592685699463, + "logps/chosen": -565.1495361328125, + "logps/rejected": -607.2949829101562, + "loss": 0.7026, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.872459888458252, + "rewards/margins": 0.8781876564025879, + "rewards/rejected": -3.7506473064422607, + "step": 2540 + }, + { + "epoch": 0.33, + "grad_norm": 12.0625, + "learning_rate": 4.213880374828903e-06, + "logits/chosen": 1.039074420928955, + "logits/rejected": 3.4681191444396973, + "logps/chosen": -507.37188720703125, + "logps/rejected": -588.2506713867188, + "loss": 0.3846, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.539095401763916, + "rewards/margins": 1.507993459701538, + "rewards/rejected": -4.047088623046875, + "step": 2550 + }, + { + "epoch": 0.33, + "grad_norm": 16.625, + "learning_rate": 4.2055480177807406e-06, + "logits/chosen": 1.8058216571807861, + "logits/rejected": 2.808535575866699, + "logps/chosen": -577.8333740234375, + "logps/rejected": -715.6800537109375, + "loss": 0.5481, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.1704020500183105, + "rewards/margins": 1.268444299697876, + "rewards/rejected": -4.438846588134766, + "step": 2560 + }, + { + "epoch": 0.34, + "grad_norm": 38.25, + "learning_rate": 4.1971800677081696e-06, + "logits/chosen": 1.7406212091445923, + "logits/rejected": 2.802582263946533, + "logps/chosen": -610.0017700195312, + "logps/rejected": -673.1057739257812, + "loss": 0.5932, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.347029209136963, + "rewards/margins": 1.028158187866211, + "rewards/rejected": -4.375187397003174, + "step": 2570 + }, + { + "epoch": 0.34, + "grad_norm": 10.625, + "learning_rate": 4.188776699241661e-06, + "logits/chosen": 0.8740745782852173, + "logits/rejected": 2.8264269828796387, + "logps/chosen": -645.9588012695312, + "logps/rejected": -742.9510498046875, + "loss": 0.4183, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5243937969207764, + "rewards/margins": 1.566954255104065, + "rewards/rejected": -5.091347694396973, + "step": 2580 + }, + { + "epoch": 0.34, + "grad_norm": 14.5625, + "learning_rate": 4.180338087750827e-06, + "logits/chosen": 1.4372971057891846, + "logits/rejected": 2.609794855117798, + "logps/chosen": -664.0543212890625, + "logps/rejected": -711.6912841796875, + "loss": 0.6018, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.798950672149658, + "rewards/margins": 1.0835754871368408, + "rewards/rejected": -4.88252592086792, + "step": 2590 + }, + { + "epoch": 0.34, + "grad_norm": 5.75, + "learning_rate": 4.1718644093407704e-06, + "logits/chosen": 1.4924824237823486, + "logits/rejected": 3.1547598838806152, + "logps/chosen": -639.6531982421875, + "logps/rejected": -705.1632080078125, + "loss": 0.5994, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.5833652019500732, + "rewards/margins": 0.9800311923027039, + "rewards/rejected": -4.563396453857422, + "step": 2600 + }, + { + "epoch": 0.34, + "eval_logits/chosen": 2.444972515106201, + "eval_logits/rejected": 3.4511168003082275, + "eval_logps/chosen": -577.7604370117188, + "eval_logps/rejected": -663.0682983398438, + "eval_loss": 0.5188149809837341, + "eval_rewards/accuracies": 0.7289999723434448, + "eval_rewards/chosen": -3.1313939094543457, + "eval_rewards/margins": 1.0535000562667847, + "eval_rewards/rejected": -4.18489408493042, + "eval_runtime": 1591.8209, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 2600 + }, + { + "epoch": 0.34, + "grad_norm": 9.1875, + "learning_rate": 4.163355840848401e-06, + "logits/chosen": 0.8957176208496094, + "logits/rejected": 2.534245491027832, + "logps/chosen": -527.8390502929688, + "logps/rejected": -632.1031494140625, + "loss": 0.3694, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.750108242034912, + "rewards/margins": 1.2850162982940674, + "rewards/rejected": -4.035124778747559, + "step": 2610 + }, + { + "epoch": 0.34, + "grad_norm": 7.875, + "learning_rate": 4.154812559838748e-06, + "logits/chosen": 1.8887290954589844, + "logits/rejected": 3.2408337593078613, + "logps/chosen": -583.8853149414062, + "logps/rejected": -626.079345703125, + "loss": 0.6086, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.222743511199951, + "rewards/margins": 0.9295894503593445, + "rewards/rejected": -4.1523332595825195, + "step": 2620 + }, + { + "epoch": 0.34, + "grad_norm": 11.0625, + "learning_rate": 4.146234744601259e-06, + "logits/chosen": 1.4272115230560303, + "logits/rejected": 2.444230079650879, + "logps/chosen": -493.49884033203125, + "logps/rejected": -589.1790771484375, + "loss": 0.4433, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5310328006744385, + "rewards/margins": 1.1930855512619019, + "rewards/rejected": -3.72411847114563, + "step": 2630 + }, + { + "epoch": 0.35, + "grad_norm": 14.3125, + "learning_rate": 4.137622574146071e-06, + "logits/chosen": 0.8492835164070129, + "logits/rejected": 1.681236982345581, + "logps/chosen": -459.9530334472656, + "logps/rejected": -513.4580078125, + "loss": 0.5401, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9301316738128662, + "rewards/margins": 0.7894424200057983, + "rewards/rejected": -2.719574451446533, + "step": 2640 + }, + { + "epoch": 0.35, + "grad_norm": 13.375, + "learning_rate": 4.12897622820028e-06, + "logits/chosen": 1.0547001361846924, + "logits/rejected": 2.3231730461120605, + "logps/chosen": -474.957275390625, + "logps/rejected": -489.51934814453125, + "loss": 0.5108, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0215628147125244, + "rewards/margins": 0.9034984707832336, + "rewards/rejected": -2.9250614643096924, + "step": 2650 + }, + { + "epoch": 0.35, + "grad_norm": 14.8125, + "learning_rate": 4.120295887204191e-06, + "logits/chosen": 1.256225824356079, + "logits/rejected": 1.9702694416046143, + "logps/chosen": -473.0255432128906, + "logps/rejected": -558.8773193359375, + "loss": 0.59, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.428616762161255, + "rewards/margins": 0.8970969319343567, + "rewards/rejected": -3.325714111328125, + "step": 2660 + }, + { + "epoch": 0.35, + "grad_norm": 13.5625, + "learning_rate": 4.111581732307548e-06, + "logits/chosen": 0.9635303616523743, + "logits/rejected": 1.6423721313476562, + "logps/chosen": -512.5065307617188, + "logps/rejected": -541.4960327148438, + "loss": 0.5615, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5323760509490967, + "rewards/margins": 0.6085655093193054, + "rewards/rejected": -3.140941619873047, + "step": 2670 + }, + { + "epoch": 0.35, + "grad_norm": 10.6875, + "learning_rate": 4.1028339453657595e-06, + "logits/chosen": 1.0258954763412476, + "logits/rejected": 1.8250477313995361, + "logps/chosen": -522.21484375, + "logps/rejected": -591.0906982421875, + "loss": 0.4589, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.263209819793701, + "rewards/margins": 1.0161672830581665, + "rewards/rejected": -3.2793774604797363, + "step": 2680 + }, + { + "epoch": 0.35, + "grad_norm": 7.1875, + "learning_rate": 4.094052708936096e-06, + "logits/chosen": 1.2521384954452515, + "logits/rejected": 3.028597354888916, + "logps/chosen": -585.7796630859375, + "logps/rejected": -675.7745361328125, + "loss": 0.4939, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.0005898475646973, + "rewards/margins": 1.0883712768554688, + "rewards/rejected": -4.088961124420166, + "step": 2690 + }, + { + "epoch": 0.35, + "grad_norm": 9.9375, + "learning_rate": 4.0852382062738874e-06, + "logits/chosen": 1.6026700735092163, + "logits/rejected": 3.266209125518799, + "logps/chosen": -545.3231811523438, + "logps/rejected": -654.74365234375, + "loss": 0.4812, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.0284156799316406, + "rewards/margins": 1.3262989521026611, + "rewards/rejected": -4.354714393615723, + "step": 2700 + }, + { + "epoch": 0.35, + "eval_logits/chosen": 2.7916452884674072, + "eval_logits/rejected": 3.776010274887085, + "eval_logps/chosen": -565.985107421875, + "eval_logps/rejected": -655.18212890625, + "eval_loss": 0.5139148235321045, + "eval_rewards/accuracies": 0.7455000281333923, + "eval_rewards/chosen": -3.013641119003296, + "eval_rewards/margins": 1.0923913717269897, + "eval_rewards/rejected": -4.106032371520996, + "eval_runtime": 1592.1095, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 2700 + }, + { + "epoch": 0.35, + "grad_norm": 10.3125, + "learning_rate": 4.076390621328693e-06, + "logits/chosen": 1.6921123266220093, + "logits/rejected": 3.050382137298584, + "logps/chosen": -557.9287719726562, + "logps/rejected": -709.6575317382812, + "loss": 0.378, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.9646167755126953, + "rewards/margins": 1.6953115463256836, + "rewards/rejected": -4.659928321838379, + "step": 2710 + }, + { + "epoch": 0.36, + "grad_norm": 14.0625, + "learning_rate": 4.067510138740467e-06, + "logits/chosen": 1.3908737897872925, + "logits/rejected": 2.611626625061035, + "logps/chosen": -581.0802001953125, + "logps/rejected": -624.2890625, + "loss": 0.5162, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.96773099899292, + "rewards/margins": 1.0839290618896484, + "rewards/rejected": -4.051660060882568, + "step": 2720 + }, + { + "epoch": 0.36, + "grad_norm": 23.25, + "learning_rate": 4.058596943835703e-06, + "logits/chosen": 1.9080839157104492, + "logits/rejected": 2.4143242835998535, + "logps/chosen": -556.7078857421875, + "logps/rejected": -639.7720947265625, + "loss": 0.4337, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.0165634155273438, + "rewards/margins": 1.1963437795639038, + "rewards/rejected": -4.212907314300537, + "step": 2730 + }, + { + "epoch": 0.36, + "grad_norm": 15.375, + "learning_rate": 4.049651222623568e-06, + "logits/chosen": 1.776745080947876, + "logits/rejected": 1.8187005519866943, + "logps/chosen": -623.6613159179688, + "logps/rejected": -740.4357299804688, + "loss": 0.5837, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.6197190284729004, + "rewards/margins": 1.086064338684082, + "rewards/rejected": -4.705783843994141, + "step": 2740 + }, + { + "epoch": 0.36, + "grad_norm": 13.0625, + "learning_rate": 4.040673161792014e-06, + "logits/chosen": 1.0641945600509644, + "logits/rejected": 2.175415277481079, + "logps/chosen": -627.2669677734375, + "logps/rejected": -683.8802490234375, + "loss": 0.5289, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.2108218669891357, + "rewards/margins": 0.9524551630020142, + "rewards/rejected": -4.163276672363281, + "step": 2750 + }, + { + "epoch": 0.36, + "grad_norm": 12.0625, + "learning_rate": 4.031662948703896e-06, + "logits/chosen": 1.3657619953155518, + "logits/rejected": 2.5233426094055176, + "logps/chosen": -588.10595703125, + "logps/rejected": -648.07666015625, + "loss": 0.5145, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.9469008445739746, + "rewards/margins": 0.9836384057998657, + "rewards/rejected": -3.93053936958313, + "step": 2760 + }, + { + "epoch": 0.36, + "grad_norm": 14.4375, + "learning_rate": 4.022620771393047e-06, + "logits/chosen": 1.4988666772842407, + "logits/rejected": 2.586674213409424, + "logps/chosen": -627.62744140625, + "logps/rejected": -679.7816162109375, + "loss": 0.5464, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.4070639610290527, + "rewards/margins": 0.8845176696777344, + "rewards/rejected": -4.291582107543945, + "step": 2770 + }, + { + "epoch": 0.36, + "grad_norm": 13.3125, + "learning_rate": 4.013546818560362e-06, + "logits/chosen": 1.5694141387939453, + "logits/rejected": 2.882098436355591, + "logps/chosen": -605.7820434570312, + "logps/rejected": -676.2273559570312, + "loss": 0.4648, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.2976009845733643, + "rewards/margins": 1.179705023765564, + "rewards/rejected": -4.4773054122924805, + "step": 2780 + }, + { + "epoch": 0.37, + "grad_norm": 17.75, + "learning_rate": 4.00444127956986e-06, + "logits/chosen": 1.1843597888946533, + "logits/rejected": 2.589926242828369, + "logps/chosen": -620.7827758789062, + "logps/rejected": -640.5551147460938, + "loss": 0.5387, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.944465160369873, + "rewards/margins": 0.9514845609664917, + "rewards/rejected": -3.895949602127075, + "step": 2790 + }, + { + "epoch": 0.37, + "grad_norm": 7.25, + "learning_rate": 3.9953043444447255e-06, + "logits/chosen": 1.532142996788025, + "logits/rejected": 2.036409854888916, + "logps/chosen": -530.7525634765625, + "logps/rejected": -674.3682861328125, + "loss": 0.4696, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.759455680847168, + "rewards/margins": 1.3281762599945068, + "rewards/rejected": -4.087632179260254, + "step": 2800 + }, + { + "epoch": 0.37, + "eval_logits/chosen": 1.8288776874542236, + "eval_logits/rejected": 2.6756598949432373, + "eval_logps/chosen": -487.6709289550781, + "eval_logps/rejected": -568.2573852539062, + "eval_loss": 0.5136556625366211, + "eval_rewards/accuracies": 0.7354999780654907, + "eval_rewards/chosen": -2.230499505996704, + "eval_rewards/margins": 1.0062857866287231, + "eval_rewards/rejected": -3.2367851734161377, + "eval_runtime": 1591.8133, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 2800 + }, + { + "epoch": 0.37, + "grad_norm": 8.125, + "learning_rate": 3.986136203863355e-06, + "logits/chosen": 0.38360413908958435, + "logits/rejected": 1.1356967687606812, + "logps/chosen": -421.0419006347656, + "logps/rejected": -479.0397033691406, + "loss": 0.5937, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9757299423217773, + "rewards/margins": 0.8018991351127625, + "rewards/rejected": -2.7776291370391846, + "step": 2810 + }, + { + "epoch": 0.37, + "grad_norm": 17.625, + "learning_rate": 3.976937049155365e-06, + "logits/chosen": -0.2097533941268921, + "logits/rejected": 0.9864130020141602, + "logps/chosen": -423.400390625, + "logps/rejected": -491.6302185058594, + "loss": 0.5371, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.563111424446106, + "rewards/margins": 0.8165823817253113, + "rewards/rejected": -2.3796939849853516, + "step": 2820 + }, + { + "epoch": 0.37, + "grad_norm": 13.8125, + "learning_rate": 3.967707072297608e-06, + "logits/chosen": 0.0686485767364502, + "logits/rejected": 0.5922690629959106, + "logps/chosen": -426.5838317871094, + "logps/rejected": -479.63543701171875, + "loss": 0.575, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5743776559829712, + "rewards/margins": 0.6596744656562805, + "rewards/rejected": -2.2340521812438965, + "step": 2830 + }, + { + "epoch": 0.37, + "grad_norm": 24.75, + "learning_rate": 3.958446465910159e-06, + "logits/chosen": 0.03275877237319946, + "logits/rejected": 1.1897265911102295, + "logps/chosen": -419.6893615722656, + "logps/rejected": -487.18505859375, + "loss": 0.4558, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.679578185081482, + "rewards/margins": 1.0936287641525269, + "rewards/rejected": -2.773206949234009, + "step": 2840 + }, + { + "epoch": 0.37, + "grad_norm": 30.25, + "learning_rate": 3.9491554232523066e-06, + "logits/chosen": 1.2540711164474487, + "logits/rejected": 1.7490119934082031, + "logps/chosen": -571.7868041992188, + "logps/rejected": -663.6853637695312, + "loss": 0.505, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0915579795837402, + "rewards/margins": 1.137508511543274, + "rewards/rejected": -4.229066371917725, + "step": 2850 + }, + { + "epoch": 0.37, + "grad_norm": 8.8125, + "learning_rate": 3.939834138218505e-06, + "logits/chosen": 1.5485732555389404, + "logits/rejected": 2.4029109477996826, + "logps/chosen": -579.219970703125, + "logps/rejected": -664.6459350585938, + "loss": 0.6115, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.1892542839050293, + "rewards/margins": 1.0732507705688477, + "rewards/rejected": -4.262505054473877, + "step": 2860 + }, + { + "epoch": 0.38, + "grad_norm": 20.75, + "learning_rate": 3.930482805334339e-06, + "logits/chosen": 0.921392560005188, + "logits/rejected": 1.3949135541915894, + "logps/chosen": -425.5755310058594, + "logps/rejected": -598.9391479492188, + "loss": 0.4478, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.347729206085205, + "rewards/margins": 1.4377492666244507, + "rewards/rejected": -3.7854785919189453, + "step": 2870 + }, + { + "epoch": 0.38, + "grad_norm": 9.8125, + "learning_rate": 3.921101619752464e-06, + "logits/chosen": 0.45665979385375977, + "logits/rejected": 1.0926839113235474, + "logps/chosen": -460.4278259277344, + "logps/rejected": -552.573486328125, + "loss": 0.4864, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.01448917388916, + "rewards/margins": 1.2656866312026978, + "rewards/rejected": -3.2801756858825684, + "step": 2880 + }, + { + "epoch": 0.38, + "grad_norm": 10.6875, + "learning_rate": 3.911690777248525e-06, + "logits/chosen": 0.17906469106674194, + "logits/rejected": 0.6382294297218323, + "logps/chosen": -469.03167724609375, + "logps/rejected": -578.2459716796875, + "loss": 0.494, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.113701581954956, + "rewards/margins": 1.1188174486160278, + "rewards/rejected": -3.2325191497802734, + "step": 2890 + }, + { + "epoch": 0.38, + "grad_norm": 21.875, + "learning_rate": 3.902250474217079e-06, + "logits/chosen": -0.15756431221961975, + "logits/rejected": 1.098966360092163, + "logps/chosen": -430.4964294433594, + "logps/rejected": -522.6734619140625, + "loss": 0.5418, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.887865662574768, + "rewards/margins": 1.009726643562317, + "rewards/rejected": -2.897592067718506, + "step": 2900 + }, + { + "epoch": 0.38, + "eval_logits/chosen": 1.189926028251648, + "eval_logits/rejected": 2.0189223289489746, + "eval_logps/chosen": -471.02703857421875, + "eval_logps/rejected": -559.2019653320312, + "eval_loss": 0.5176796317100525, + "eval_rewards/accuracies": 0.734499990940094, + "eval_rewards/chosen": -2.0640602111816406, + "eval_rewards/margins": 1.0821698904037476, + "eval_rewards/rejected": -3.1462302207946777, + "eval_runtime": 1591.2862, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 2900 + }, + { + "epoch": 0.38, + "grad_norm": 9.75, + "learning_rate": 3.892780907667495e-06, + "logits/chosen": 0.06123056262731552, + "logits/rejected": 0.9262319803237915, + "logps/chosen": -484.229248046875, + "logps/rejected": -581.5013427734375, + "loss": 0.4237, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.977403998374939, + "rewards/margins": 1.2522307634353638, + "rewards/rejected": -3.229635238647461, + "step": 2910 + }, + { + "epoch": 0.38, + "grad_norm": 11.5625, + "learning_rate": 3.883282275219837e-06, + "logits/chosen": 0.5973024368286133, + "logits/rejected": 1.3094433546066284, + "logps/chosen": -473.7518005371094, + "logps/rejected": -624.4464111328125, + "loss": 0.4704, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.3923592567443848, + "rewards/margins": 1.4723341464996338, + "rewards/rejected": -3.8646934032440186, + "step": 2920 + }, + { + "epoch": 0.38, + "grad_norm": 7.21875, + "learning_rate": 3.873754775100751e-06, + "logits/chosen": 0.2925790548324585, + "logits/rejected": 1.3715248107910156, + "logps/chosen": -503.6017150878906, + "logps/rejected": -597.2714233398438, + "loss": 0.4469, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.3737165927886963, + "rewards/margins": 1.3970870971679688, + "rewards/rejected": -3.770803928375244, + "step": 2930 + }, + { + "epoch": 0.38, + "grad_norm": 22.0, + "learning_rate": 3.8641986061393145e-06, + "logits/chosen": 0.028101766481995583, + "logits/rejected": 1.5201627016067505, + "logps/chosen": -550.6173095703125, + "logps/rejected": -695.3499755859375, + "loss": 0.4544, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7320361137390137, + "rewards/margins": 1.6946220397949219, + "rewards/rejected": -4.426657676696777, + "step": 2940 + }, + { + "epoch": 0.39, + "grad_norm": 15.875, + "learning_rate": 3.854613967762898e-06, + "logits/chosen": 0.7224695682525635, + "logits/rejected": 1.902295470237732, + "logps/chosen": -589.226318359375, + "logps/rejected": -696.6246337890625, + "loss": 0.4663, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.0647222995758057, + "rewards/margins": 1.5787670612335205, + "rewards/rejected": -4.643489360809326, + "step": 2950 + }, + { + "epoch": 0.39, + "grad_norm": 24.5, + "learning_rate": 3.845001059992999e-06, + "logits/chosen": 1.1261330842971802, + "logits/rejected": 2.1730661392211914, + "logps/chosen": -602.8712768554688, + "logps/rejected": -735.8431396484375, + "loss": 0.4671, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.3659489154815674, + "rewards/margins": 1.5933061838150024, + "rewards/rejected": -4.959255218505859, + "step": 2960 + }, + { + "epoch": 0.39, + "grad_norm": 16.75, + "learning_rate": 3.835360083441067e-06, + "logits/chosen": 0.768014132976532, + "logits/rejected": 1.7233359813690186, + "logps/chosen": -632.5802612304688, + "logps/rejected": -771.4032592773438, + "loss": 0.4379, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.644712448120117, + "rewards/margins": 1.5702012777328491, + "rewards/rejected": -5.214913368225098, + "step": 2970 + }, + { + "epoch": 0.39, + "grad_norm": 18.0, + "learning_rate": 3.825691239304318e-06, + "logits/chosen": 0.6955349445343018, + "logits/rejected": 2.1932034492492676, + "logps/chosen": -729.919189453125, + "logps/rejected": -745.0521240234375, + "loss": 0.7656, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.9248478412628174, + "rewards/margins": 0.9639946222305298, + "rewards/rejected": -4.888842582702637, + "step": 2980 + }, + { + "epoch": 0.39, + "grad_norm": 26.875, + "learning_rate": 3.8159947293615385e-06, + "logits/chosen": 0.7099810838699341, + "logits/rejected": 1.620305061340332, + "logps/chosen": -573.0277099609375, + "logps/rejected": -639.2322998046875, + "loss": 0.4878, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.8682754039764404, + "rewards/margins": 1.046812653541565, + "rewards/rejected": -3.915087938308716, + "step": 2990 + }, + { + "epoch": 0.39, + "grad_norm": 15.875, + "learning_rate": 3.806270755968866e-06, + "logits/chosen": 1.8222758769989014, + "logits/rejected": 2.27435040473938, + "logps/chosen": -488.9098205566406, + "logps/rejected": -600.2335815429688, + "loss": 0.5068, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6705269813537598, + "rewards/margins": 1.1377004384994507, + "rewards/rejected": -3.808227062225342, + "step": 3000 + }, + { + "epoch": 0.39, + "eval_logits/chosen": 2.0023179054260254, + "eval_logits/rejected": 2.8678812980651855, + "eval_logps/chosen": -510.2568664550781, + "eval_logps/rejected": -601.0542602539062, + "eval_loss": 0.5096300840377808, + "eval_rewards/accuracies": 0.7400000095367432, + "eval_rewards/chosen": -2.4563584327697754, + "eval_rewards/margins": 1.1083952188491821, + "eval_rewards/rejected": -3.564753770828247, + "eval_runtime": 1591.3168, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 3000 + }, + { + "epoch": 0.39, + "grad_norm": 13.3125, + "learning_rate": 3.7965195220555784e-06, + "logits/chosen": 1.7134437561035156, + "logits/rejected": 2.0564117431640625, + "logps/chosen": -479.67449951171875, + "logps/rejected": -604.8666381835938, + "loss": 0.5536, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6466455459594727, + "rewards/margins": 1.1505428552627563, + "rewards/rejected": -3.7971885204315186, + "step": 3010 + }, + { + "epoch": 0.4, + "grad_norm": 12.0, + "learning_rate": 3.786741231119847e-06, + "logits/chosen": 0.3799988329410553, + "logits/rejected": 1.70327889919281, + "logps/chosen": -503.31268310546875, + "logps/rejected": -576.9793701171875, + "loss": 0.4446, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9507548809051514, + "rewards/margins": 1.2016483545303345, + "rewards/rejected": -3.1524033546447754, + "step": 3020 + }, + { + "epoch": 0.4, + "grad_norm": 5.53125, + "learning_rate": 3.7769360872244992e-06, + "logits/chosen": 1.170353889465332, + "logits/rejected": 1.2406959533691406, + "logps/chosen": -431.1206970214844, + "logps/rejected": -513.315185546875, + "loss": 0.4801, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9646472930908203, + "rewards/margins": 0.9679223895072937, + "rewards/rejected": -2.932569980621338, + "step": 3030 + }, + { + "epoch": 0.4, + "grad_norm": 15.875, + "learning_rate": 3.767104294992754e-06, + "logits/chosen": 0.9803180694580078, + "logits/rejected": 2.352583885192871, + "logps/chosen": -454.54541015625, + "logps/rejected": -532.4876098632812, + "loss": 0.46, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.114039659500122, + "rewards/margins": 1.1198477745056152, + "rewards/rejected": -3.2338874340057373, + "step": 3040 + }, + { + "epoch": 0.4, + "grad_norm": 8.0, + "learning_rate": 3.7572460596039524e-06, + "logits/chosen": 0.7494764924049377, + "logits/rejected": 1.6844794750213623, + "logps/chosen": -467.81005859375, + "logps/rejected": -567.791748046875, + "loss": 0.5172, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.2958292961120605, + "rewards/margins": 1.046289086341858, + "rewards/rejected": -3.342118501663208, + "step": 3050 + }, + { + "epoch": 0.4, + "grad_norm": 4.8125, + "learning_rate": 3.74736158678928e-06, + "logits/chosen": 0.8211283683776855, + "logits/rejected": 1.3175485134124756, + "logps/chosen": -428.0367736816406, + "logps/rejected": -535.9208374023438, + "loss": 0.4252, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8630352020263672, + "rewards/margins": 1.1972407102584839, + "rewards/rejected": -3.0602760314941406, + "step": 3060 + }, + { + "epoch": 0.4, + "grad_norm": 8.875, + "learning_rate": 3.7374510828274673e-06, + "logits/chosen": 0.7421783208847046, + "logits/rejected": 1.331279993057251, + "logps/chosen": -464.83795166015625, + "logps/rejected": -571.86767578125, + "loss": 0.5101, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2767622470855713, + "rewards/margins": 1.0495450496673584, + "rewards/rejected": -3.3263068199157715, + "step": 3070 + }, + { + "epoch": 0.4, + "grad_norm": 14.125, + "learning_rate": 3.72751475454049e-06, + "logits/chosen": 0.8095995783805847, + "logits/rejected": 1.2267462015151978, + "logps/chosen": -514.5219116210938, + "logps/rejected": -657.3064575195312, + "loss": 0.5369, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.805443048477173, + "rewards/margins": 1.2120649814605713, + "rewards/rejected": -4.017508029937744, + "step": 3080 + }, + { + "epoch": 0.4, + "grad_norm": 37.0, + "learning_rate": 3.7175528092892503e-06, + "logits/chosen": 0.6734537482261658, + "logits/rejected": 0.9628368616104126, + "logps/chosen": -592.8623046875, + "logps/rejected": -679.498046875, + "loss": 0.6657, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.2352747917175293, + "rewards/margins": 0.9270402789115906, + "rewards/rejected": -4.162315368652344, + "step": 3090 + }, + { + "epoch": 0.41, + "grad_norm": 15.9375, + "learning_rate": 3.7075654549692498e-06, + "logits/chosen": 0.25260859727859497, + "logits/rejected": 1.0015579462051392, + "logps/chosen": -547.7056884765625, + "logps/rejected": -655.2227783203125, + "loss": 0.4429, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.885751962661743, + "rewards/margins": 1.3275845050811768, + "rewards/rejected": -4.213336944580078, + "step": 3100 + }, + { + "epoch": 0.41, + "eval_logits/chosen": 0.6491054892539978, + "eval_logits/rejected": 1.3309341669082642, + "eval_logps/chosen": -540.056640625, + "eval_logps/rejected": -633.2681884765625, + "eval_loss": 0.5323993563652039, + "eval_rewards/accuracies": 0.7179999947547913, + "eval_rewards/chosen": -2.7543554306030273, + "eval_rewards/margins": 1.1325373649597168, + "eval_rewards/rejected": -3.886892557144165, + "eval_runtime": 1591.7657, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 3100 + }, + { + "epoch": 0.41, + "grad_norm": 13.1875, + "learning_rate": 3.697552900006249e-06, + "logits/chosen": 0.2841110825538635, + "logits/rejected": 0.6535122990608215, + "logps/chosen": -558.1544189453125, + "logps/rejected": -608.4871826171875, + "loss": 0.5408, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.807015895843506, + "rewards/margins": 0.8503965139389038, + "rewards/rejected": -3.65741229057312, + "step": 3110 + }, + { + "epoch": 0.41, + "grad_norm": 10.625, + "learning_rate": 3.6875153533519244e-06, + "logits/chosen": 0.0682436004281044, + "logits/rejected": 0.5094722509384155, + "logps/chosen": -544.4560546875, + "logps/rejected": -663.2474975585938, + "loss": 0.5958, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.9370124340057373, + "rewards/margins": 0.9294888377189636, + "rewards/rejected": -3.8665013313293457, + "step": 3120 + }, + { + "epoch": 0.41, + "grad_norm": 49.75, + "learning_rate": 3.6774530244794992e-06, + "logits/chosen": 0.13949742913246155, + "logits/rejected": 0.7011431455612183, + "logps/chosen": -490.24969482421875, + "logps/rejected": -611.5720825195312, + "loss": 0.5313, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.532118797302246, + "rewards/margins": 1.2002549171447754, + "rewards/rejected": -3.7323734760284424, + "step": 3130 + }, + { + "epoch": 0.41, + "grad_norm": 10.75, + "learning_rate": 3.667366123379378e-06, + "logits/chosen": 0.017296016216278076, + "logits/rejected": 0.8059386014938354, + "logps/chosen": -509.57171630859375, + "logps/rejected": -552.8974609375, + "loss": 0.5477, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.5200209617614746, + "rewards/margins": 0.8663853406906128, + "rewards/rejected": -3.3864059448242188, + "step": 3140 + }, + { + "epoch": 0.41, + "grad_norm": 10.5625, + "learning_rate": 3.6572548605547607e-06, + "logits/chosen": 0.2774070203304291, + "logits/rejected": 1.5602456331253052, + "logps/chosen": -511.4872131347656, + "logps/rejected": -610.0732421875, + "loss": 0.3978, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.307269811630249, + "rewards/margins": 1.3458932638168335, + "rewards/rejected": -3.653163194656372, + "step": 3150 + }, + { + "epoch": 0.41, + "grad_norm": 12.0625, + "learning_rate": 3.6471194470172538e-06, + "logits/chosen": 0.7748087644577026, + "logits/rejected": 1.4111305475234985, + "logps/chosen": -550.489990234375, + "logps/rejected": -685.2496948242188, + "loss": 0.4381, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.574678421020508, + "rewards/margins": 1.5018852949142456, + "rewards/rejected": -4.076563358306885, + "step": 3160 + }, + { + "epoch": 0.41, + "grad_norm": 19.25, + "learning_rate": 3.636960094282461e-06, + "logits/chosen": 0.6883363723754883, + "logits/rejected": 1.5032721757888794, + "logps/chosen": -598.4791870117188, + "logps/rejected": -718.0567626953125, + "loss": 0.4243, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.152695894241333, + "rewards/margins": 1.4088233709335327, + "rewards/rejected": -4.561519145965576, + "step": 3170 + }, + { + "epoch": 0.42, + "grad_norm": 13.375, + "learning_rate": 3.6267770143655743e-06, + "logits/chosen": 0.7542746067047119, + "logits/rejected": 2.28153395652771, + "logps/chosen": -606.701171875, + "logps/rejected": -643.6585693359375, + "loss": 0.6149, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.396790027618408, + "rewards/margins": 1.0983827114105225, + "rewards/rejected": -4.49517297744751, + "step": 3180 + }, + { + "epoch": 0.42, + "grad_norm": 15.6875, + "learning_rate": 3.6165704197769484e-06, + "logits/chosen": 0.7957227230072021, + "logits/rejected": 1.6306054592132568, + "logps/chosen": -591.9930419921875, + "logps/rejected": -690.844970703125, + "loss": 0.4823, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.360551118850708, + "rewards/margins": 1.256277322769165, + "rewards/rejected": -4.616828441619873, + "step": 3190 + }, + { + "epoch": 0.42, + "grad_norm": 6.9375, + "learning_rate": 3.606340523517663e-06, + "logits/chosen": 0.30117177963256836, + "logits/rejected": 1.471394419670105, + "logps/chosen": -613.8837890625, + "logps/rejected": -644.7623291015625, + "loss": 0.5977, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.912050724029541, + "rewards/margins": 0.9650869369506836, + "rewards/rejected": -3.8771374225616455, + "step": 3200 + }, + { + "epoch": 0.42, + "eval_logits/chosen": 1.2328182458877563, + "eval_logits/rejected": 2.0169761180877686, + "eval_logps/chosen": -553.0416259765625, + "eval_logps/rejected": -642.8284912109375, + "eval_loss": 0.4963167607784271, + "eval_rewards/accuracies": 0.7425000071525574, + "eval_rewards/chosen": -2.8842058181762695, + "eval_rewards/margins": 1.0982892513275146, + "eval_rewards/rejected": -3.9824953079223633, + "eval_runtime": 1591.8278, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 3200 + }, + { + "epoch": 0.42, + "grad_norm": 11.5, + "learning_rate": 3.5960875390750793e-06, + "logits/chosen": 0.5907621383666992, + "logits/rejected": 1.5867271423339844, + "logps/chosen": -515.3133544921875, + "logps/rejected": -595.2828979492188, + "loss": 0.4665, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7821831703186035, + "rewards/margins": 1.0895627737045288, + "rewards/rejected": -3.871746063232422, + "step": 3210 + }, + { + "epoch": 0.42, + "grad_norm": 14.1875, + "learning_rate": 3.585811680418386e-06, + "logits/chosen": 0.5541056394577026, + "logits/rejected": 1.5352404117584229, + "logps/chosen": -594.6611328125, + "logps/rejected": -660.6251831054688, + "loss": 0.5488, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.016439914703369, + "rewards/margins": 1.1275527477264404, + "rewards/rejected": -4.1439924240112305, + "step": 3220 + }, + { + "epoch": 0.42, + "grad_norm": 25.125, + "learning_rate": 3.5755131619941347e-06, + "logits/chosen": 0.729204535484314, + "logits/rejected": 1.6516637802124023, + "logps/chosen": -512.8201904296875, + "logps/rejected": -624.3805541992188, + "loss": 0.5408, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.7431254386901855, + "rewards/margins": 1.1400768756866455, + "rewards/rejected": -3.883202314376831, + "step": 3230 + }, + { + "epoch": 0.42, + "grad_norm": 15.5, + "learning_rate": 3.565192198721759e-06, + "logits/chosen": 0.37775200605392456, + "logits/rejected": 2.0130581855773926, + "logps/chosen": -549.7015380859375, + "logps/rejected": -609.7913208007812, + "loss": 0.4715, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.597252368927002, + "rewards/margins": 1.3094682693481445, + "rewards/rejected": -3.9067211151123047, + "step": 3240 + }, + { + "epoch": 0.43, + "grad_norm": 13.5, + "learning_rate": 3.5548490059890965e-06, + "logits/chosen": 0.22869491577148438, + "logits/rejected": 1.187572717666626, + "logps/chosen": -550.7482299804688, + "logps/rejected": -615.3689575195312, + "loss": 0.5063, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6587748527526855, + "rewards/margins": 1.0318644046783447, + "rewards/rejected": -3.690639019012451, + "step": 3250 + }, + { + "epoch": 0.43, + "grad_norm": 12.8125, + "learning_rate": 3.5444837996478903e-06, + "logits/chosen": 0.3918129503726959, + "logits/rejected": 0.8749796152114868, + "logps/chosen": -483.89794921875, + "logps/rejected": -601.3150634765625, + "loss": 0.4459, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3012442588806152, + "rewards/margins": 1.3543719053268433, + "rewards/rejected": -3.655616044998169, + "step": 3260 + }, + { + "epoch": 0.43, + "grad_norm": 26.125, + "learning_rate": 3.534096796009282e-06, + "logits/chosen": 0.4137166440486908, + "logits/rejected": 1.1227707862854004, + "logps/chosen": -560.236083984375, + "logps/rejected": -709.2138671875, + "loss": 0.5595, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.937579393386841, + "rewards/margins": 1.2887606620788574, + "rewards/rejected": -4.226339817047119, + "step": 3270 + }, + { + "epoch": 0.43, + "grad_norm": 8.3125, + "learning_rate": 3.5236882118393046e-06, + "logits/chosen": 0.8245857357978821, + "logits/rejected": 1.0247094631195068, + "logps/chosen": -529.5103149414062, + "logps/rejected": -669.4229125976562, + "loss": 0.4461, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.791597604751587, + "rewards/margins": 1.3754332065582275, + "rewards/rejected": -4.1670308113098145, + "step": 3280 + }, + { + "epoch": 0.43, + "grad_norm": 15.875, + "learning_rate": 3.5132582643543513e-06, + "logits/chosen": -0.049818553030490875, + "logits/rejected": 0.3968813717365265, + "logps/chosen": -564.7365112304688, + "logps/rejected": -670.1153564453125, + "loss": 0.5883, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.8896031379699707, + "rewards/margins": 1.0605024099349976, + "rewards/rejected": -3.950105667114258, + "step": 3290 + }, + { + "epoch": 0.43, + "grad_norm": 12.25, + "learning_rate": 3.5028071712166456e-06, + "logits/chosen": -0.1731199026107788, + "logits/rejected": 0.6234968900680542, + "logps/chosen": -515.9782104492188, + "logps/rejected": -608.0538330078125, + "loss": 0.5281, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.5141870975494385, + "rewards/margins": 1.0168650150299072, + "rewards/rejected": -3.5310521125793457, + "step": 3300 + }, + { + "epoch": 0.43, + "eval_logits/chosen": 0.4294242560863495, + "eval_logits/rejected": 1.1826006174087524, + "eval_logps/chosen": -507.1646728515625, + "eval_logps/rejected": -599.690673828125, + "eval_loss": 0.5074065327644348, + "eval_rewards/accuracies": 0.7325000166893005, + "eval_rewards/chosen": -2.425436019897461, + "eval_rewards/margins": 1.1256811618804932, + "eval_rewards/rejected": -3.551117181777954, + "eval_runtime": 1590.7186, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 3300 + }, + { + "epoch": 0.43, + "grad_norm": 11.0625, + "learning_rate": 3.4923351505297008e-06, + "logits/chosen": 0.23522624373435974, + "logits/rejected": 0.05815122276544571, + "logps/chosen": -443.71710205078125, + "logps/rejected": -581.3840942382812, + "loss": 0.5672, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.257312059402466, + "rewards/margins": 0.9799342155456543, + "rewards/rejected": -3.23724627494812, + "step": 3310 + }, + { + "epoch": 0.43, + "grad_norm": 22.5, + "learning_rate": 3.481842420833766e-06, + "logits/chosen": -0.09912939369678497, + "logits/rejected": 0.4761788249015808, + "logps/chosen": -439.24090576171875, + "logps/rejected": -557.4966430664062, + "loss": 0.5156, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2463881969451904, + "rewards/margins": 1.1034013032913208, + "rewards/rejected": -3.34978985786438, + "step": 3320 + }, + { + "epoch": 0.44, + "grad_norm": 20.75, + "learning_rate": 3.4713292011012645e-06, + "logits/chosen": -0.19417408108711243, + "logits/rejected": 0.4867437481880188, + "logps/chosen": -598.3062133789062, + "logps/rejected": -665.0530395507812, + "loss": 0.5448, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.8582839965820312, + "rewards/margins": 1.1232712268829346, + "rewards/rejected": -3.981555223464966, + "step": 3330 + }, + { + "epoch": 0.44, + "grad_norm": 36.75, + "learning_rate": 3.4607957107322277e-06, + "logits/chosen": 0.16228322684764862, + "logits/rejected": 0.97930908203125, + "logps/chosen": -601.2854614257812, + "logps/rejected": -665.2420043945312, + "loss": 0.5648, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2820160388946533, + "rewards/margins": 0.9795142412185669, + "rewards/rejected": -4.261530876159668, + "step": 3340 + }, + { + "epoch": 0.44, + "grad_norm": 7.1875, + "learning_rate": 3.4502421695497112e-06, + "logits/chosen": 0.13692227005958557, + "logits/rejected": 1.0607303380966187, + "logps/chosen": -552.9117431640625, + "logps/rejected": -664.9490356445312, + "loss": 0.4744, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.911181688308716, + "rewards/margins": 1.3374974727630615, + "rewards/rejected": -4.248679161071777, + "step": 3350 + }, + { + "epoch": 0.44, + "grad_norm": 10.125, + "learning_rate": 3.4396687977952137e-06, + "logits/chosen": 0.16164417564868927, + "logits/rejected": 0.38801008462905884, + "logps/chosen": -572.5013427734375, + "logps/rejected": -685.3876342773438, + "loss": 0.4979, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.01336407661438, + "rewards/margins": 1.1361128091812134, + "rewards/rejected": -4.149477481842041, + "step": 3360 + }, + { + "epoch": 0.44, + "grad_norm": 7.78125, + "learning_rate": 3.429075816124075e-06, + "logits/chosen": -0.2848863899707794, + "logits/rejected": 0.6865512132644653, + "logps/chosen": -520.8642578125, + "logps/rejected": -632.9857177734375, + "loss": 0.3749, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.5089519023895264, + "rewards/margins": 1.552814245223999, + "rewards/rejected": -4.061766147613525, + "step": 3370 + }, + { + "epoch": 0.44, + "grad_norm": 8.875, + "learning_rate": 3.418463445600874e-06, + "logits/chosen": 0.7481366991996765, + "logits/rejected": 1.7227299213409424, + "logps/chosen": -542.5957641601562, + "logps/rejected": -732.8658447265625, + "loss": 0.387, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.3663573265075684, + "rewards/margins": 1.6481434106826782, + "rewards/rejected": -5.014500617980957, + "step": 3380 + }, + { + "epoch": 0.44, + "grad_norm": 8.3125, + "learning_rate": 3.4078319076948173e-06, + "logits/chosen": 0.3249647617340088, + "logits/rejected": 0.9782923460006714, + "logps/chosen": -604.2789916992188, + "logps/rejected": -669.7869873046875, + "loss": 0.5856, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.1589674949645996, + "rewards/margins": 0.9365655779838562, + "rewards/rejected": -4.095533847808838, + "step": 3390 + }, + { + "epoch": 0.44, + "grad_norm": 11.6875, + "learning_rate": 3.3971814242751123e-06, + "logits/chosen": 0.7070889472961426, + "logits/rejected": 1.4743976593017578, + "logps/chosen": -574.7918090820312, + "logps/rejected": -693.0616455078125, + "loss": 0.5114, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.179802417755127, + "rewards/margins": 1.2648974657058716, + "rewards/rejected": -4.444699764251709, + "step": 3400 + }, + { + "epoch": 0.44, + "eval_logits/chosen": 1.2128088474273682, + "eval_logits/rejected": 2.149277687072754, + "eval_logps/chosen": -548.863037109375, + "eval_logps/rejected": -652.9094848632812, + "eval_loss": 0.5196763277053833, + "eval_rewards/accuracies": 0.7254999876022339, + "eval_rewards/chosen": -2.8424201011657715, + "eval_rewards/margins": 1.2408852577209473, + "eval_rewards/rejected": -4.083305835723877, + "eval_runtime": 1591.6948, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 3400 + }, + { + "epoch": 0.45, + "grad_norm": 18.25, + "learning_rate": 3.386512217606339e-06, + "logits/chosen": 0.46127867698669434, + "logits/rejected": 1.181891679763794, + "logps/chosen": -542.9649658203125, + "logps/rejected": -631.12451171875, + "loss": 0.5582, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.793233633041382, + "rewards/margins": 0.9561799764633179, + "rewards/rejected": -3.7494137287139893, + "step": 3410 + }, + { + "epoch": 0.45, + "grad_norm": 16.5, + "learning_rate": 3.375824510343816e-06, + "logits/chosen": 0.9180284738540649, + "logits/rejected": 1.5742998123168945, + "logps/chosen": -563.2110595703125, + "logps/rejected": -746.85205078125, + "loss": 0.3963, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2570576667785645, + "rewards/margins": 1.4821968078613281, + "rewards/rejected": -4.739253997802734, + "step": 3420 + }, + { + "epoch": 0.45, + "grad_norm": 18.25, + "learning_rate": 3.3651185255289466e-06, + "logits/chosen": 0.5649822354316711, + "logits/rejected": 1.8119010925292969, + "logps/chosen": -595.4826049804688, + "logps/rejected": -661.2381591796875, + "loss": 0.507, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.104897975921631, + "rewards/margins": 1.009965419769287, + "rewards/rejected": -4.114863395690918, + "step": 3430 + }, + { + "epoch": 0.45, + "grad_norm": 22.625, + "learning_rate": 3.354394486584568e-06, + "logits/chosen": 0.7015897035598755, + "logits/rejected": 1.8448234796524048, + "logps/chosen": -575.3528442382812, + "logps/rejected": -682.8203735351562, + "loss": 0.5305, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.233482837677002, + "rewards/margins": 1.1560642719268799, + "rewards/rejected": -4.389547348022461, + "step": 3440 + }, + { + "epoch": 0.45, + "grad_norm": 17.75, + "learning_rate": 3.3436526173102913e-06, + "logits/chosen": 1.5318998098373413, + "logits/rejected": 2.359741687774658, + "logps/chosen": -564.7638549804688, + "logps/rejected": -700.0146484375, + "loss": 0.4162, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8777577877044678, + "rewards/margins": 1.642392873764038, + "rewards/rejected": -4.520151138305664, + "step": 3450 + }, + { + "epoch": 0.45, + "grad_norm": 7.6875, + "learning_rate": 3.3328931418778254e-06, + "logits/chosen": 1.0667393207550049, + "logits/rejected": 1.512853741645813, + "logps/chosen": -573.0630493164062, + "logps/rejected": -682.3682250976562, + "loss": 0.5208, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.0130012035369873, + "rewards/margins": 1.110229253768921, + "rewards/rejected": -4.123230457305908, + "step": 3460 + }, + { + "epoch": 0.45, + "grad_norm": 13.875, + "learning_rate": 3.3221162848263028e-06, + "logits/chosen": 0.6153064966201782, + "logits/rejected": 2.2130637168884277, + "logps/chosen": -586.8394165039062, + "logps/rejected": -665.9385986328125, + "loss": 0.502, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8460216522216797, + "rewards/margins": 1.336303472518921, + "rewards/rejected": -4.18232536315918, + "step": 3470 + }, + { + "epoch": 0.46, + "grad_norm": 17.5, + "learning_rate": 3.3113222710575914e-06, + "logits/chosen": 1.2034531831741333, + "logits/rejected": 2.5035030841827393, + "logps/chosen": -584.3963623046875, + "logps/rejected": -676.0372314453125, + "loss": 0.4895, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.894857168197632, + "rewards/margins": 1.2428711652755737, + "rewards/rejected": -4.137728214263916, + "step": 3480 + }, + { + "epoch": 0.46, + "grad_norm": 21.625, + "learning_rate": 3.300511325831603e-06, + "logits/chosen": 0.7528663873672485, + "logits/rejected": 1.6426197290420532, + "logps/chosen": -545.1719360351562, + "logps/rejected": -673.9959716796875, + "loss": 0.4629, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.682860851287842, + "rewards/margins": 1.4254639148712158, + "rewards/rejected": -4.1083245277404785, + "step": 3490 + }, + { + "epoch": 0.46, + "grad_norm": 14.75, + "learning_rate": 3.289683674761592e-06, + "logits/chosen": 1.3696619272232056, + "logits/rejected": 2.1932106018066406, + "logps/chosen": -555.2965087890625, + "logps/rejected": -658.5980834960938, + "loss": 0.4984, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.1395516395568848, + "rewards/margins": 1.1953315734863281, + "rewards/rejected": -4.334883213043213, + "step": 3500 + }, + { + "epoch": 0.46, + "eval_logits/chosen": 2.4203193187713623, + "eval_logits/rejected": 3.3501927852630615, + "eval_logps/chosen": -584.5863647460938, + "eval_logps/rejected": -686.7951049804688, + "eval_loss": 0.500175416469574, + "eval_rewards/accuracies": 0.7450000047683716, + "eval_rewards/chosen": -3.199653148651123, + "eval_rewards/margins": 1.222508430480957, + "eval_rewards/rejected": -4.42216157913208, + "eval_runtime": 1591.0902, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 3500 + }, + { + "epoch": 0.46, + "grad_norm": 10.1875, + "learning_rate": 3.2788395438094444e-06, + "logits/chosen": 1.513866662979126, + "logits/rejected": 2.8263657093048096, + "logps/chosen": -604.82861328125, + "logps/rejected": -735.3837890625, + "loss": 0.4415, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.161881923675537, + "rewards/margins": 1.5348942279815674, + "rewards/rejected": -4.696776390075684, + "step": 3510 + }, + { + "epoch": 0.46, + "grad_norm": 12.3125, + "learning_rate": 3.2679791592809653e-06, + "logits/chosen": 1.3661248683929443, + "logits/rejected": 2.9377036094665527, + "logps/chosen": -630.15966796875, + "logps/rejected": -705.7320556640625, + "loss": 0.4701, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.424785614013672, + "rewards/margins": 1.3289605379104614, + "rewards/rejected": -4.753746032714844, + "step": 3520 + }, + { + "epoch": 0.46, + "grad_norm": 20.75, + "learning_rate": 3.257102747821157e-06, + "logits/chosen": 1.612074851989746, + "logits/rejected": 2.272280216217041, + "logps/chosen": -618.0646362304688, + "logps/rejected": -717.951171875, + "loss": 0.5457, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.4815406799316406, + "rewards/margins": 1.1168197393417358, + "rewards/rejected": -4.598360538482666, + "step": 3530 + }, + { + "epoch": 0.46, + "grad_norm": 5.90625, + "learning_rate": 3.246210536409484e-06, + "logits/chosen": 0.9772621989250183, + "logits/rejected": 2.2998509407043457, + "logps/chosen": -603.9741821289062, + "logps/rejected": -677.609130859375, + "loss": 0.4589, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.1230061054229736, + "rewards/margins": 1.463616132736206, + "rewards/rejected": -4.586622714996338, + "step": 3540 + }, + { + "epoch": 0.46, + "grad_norm": 10.5625, + "learning_rate": 3.235302752355142e-06, + "logits/chosen": 1.1950008869171143, + "logits/rejected": 2.352081775665283, + "logps/chosen": -600.2168579101562, + "logps/rejected": -651.6273803710938, + "loss": 0.5152, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2299892902374268, + "rewards/margins": 1.007690191268921, + "rewards/rejected": -4.237679481506348, + "step": 3550 + }, + { + "epoch": 0.47, + "grad_norm": 17.25, + "learning_rate": 3.2243796232923097e-06, + "logits/chosen": 1.0240675210952759, + "logits/rejected": 2.123300790786743, + "logps/chosen": -528.0433349609375, + "logps/rejected": -596.7354736328125, + "loss": 0.4909, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9449267387390137, + "rewards/margins": 1.1938796043395996, + "rewards/rejected": -4.138806343078613, + "step": 3560 + }, + { + "epoch": 0.47, + "grad_norm": 13.0, + "learning_rate": 3.2134413771754037e-06, + "logits/chosen": 1.0020049810409546, + "logits/rejected": 2.169445037841797, + "logps/chosen": -583.1009521484375, + "logps/rejected": -682.7677612304688, + "loss": 0.4336, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.9823849201202393, + "rewards/margins": 1.2904002666473389, + "rewards/rejected": -4.272785186767578, + "step": 3570 + }, + { + "epoch": 0.47, + "grad_norm": 8.625, + "learning_rate": 3.2024882422743118e-06, + "logits/chosen": 1.328412652015686, + "logits/rejected": 2.1645195484161377, + "logps/chosen": -493.51678466796875, + "logps/rejected": -633.2418212890625, + "loss": 0.3906, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.7480642795562744, + "rewards/margins": 1.3832073211669922, + "rewards/rejected": -4.1312713623046875, + "step": 3580 + }, + { + "epoch": 0.47, + "grad_norm": 12.3125, + "learning_rate": 3.1915204471696425e-06, + "logits/chosen": 1.0298570394515991, + "logits/rejected": 2.2715506553649902, + "logps/chosen": -526.4378662109375, + "logps/rejected": -669.9403076171875, + "loss": 0.4011, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.6348226070404053, + "rewards/margins": 1.5454890727996826, + "rewards/rejected": -4.180312156677246, + "step": 3590 + }, + { + "epoch": 0.47, + "grad_norm": 7.3125, + "learning_rate": 3.180538220747943e-06, + "logits/chosen": 1.3050510883331299, + "logits/rejected": 1.3333051204681396, + "logps/chosen": -555.6622924804688, + "logps/rejected": -714.8551635742188, + "loss": 0.5723, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.9709713459014893, + "rewards/margins": 1.0748671293258667, + "rewards/rejected": -4.045838832855225, + "step": 3600 + }, + { + "epoch": 0.47, + "eval_logits/chosen": 2.2597780227661133, + "eval_logits/rejected": 3.1533713340759277, + "eval_logps/chosen": -565.27490234375, + "eval_logps/rejected": -668.9721069335938, + "eval_loss": 0.5010019540786743, + "eval_rewards/accuracies": 0.7409999966621399, + "eval_rewards/chosen": -3.006538152694702, + "eval_rewards/margins": 1.2373936176300049, + "eval_rewards/rejected": -4.243931770324707, + "eval_runtime": 1613.3695, + "eval_samples_per_second": 1.24, + "eval_steps_per_second": 0.31, + "step": 3600 + }, + { + "epoch": 0.47, + "grad_norm": 16.125, + "learning_rate": 3.1695417921969287e-06, + "logits/chosen": 1.2470731735229492, + "logits/rejected": 2.3240132331848145, + "logps/chosen": -545.6837768554688, + "logps/rejected": -646.6546020507812, + "loss": 0.4794, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.9675731658935547, + "rewards/margins": 1.2627222537994385, + "rewards/rejected": -4.230295181274414, + "step": 3610 + }, + { + "epoch": 0.47, + "grad_norm": 14.0, + "learning_rate": 3.158531391000697e-06, + "logits/chosen": 0.9274897575378418, + "logits/rejected": 1.4380440711975098, + "logps/chosen": -529.5089721679688, + "logps/rejected": -630.5634155273438, + "loss": 0.6304, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.034899950027466, + "rewards/margins": 0.9231551289558411, + "rewards/rejected": -3.958055019378662, + "step": 3620 + }, + { + "epoch": 0.48, + "grad_norm": 33.25, + "learning_rate": 3.147507246934943e-06, + "logits/chosen": 0.8136155009269714, + "logits/rejected": 1.3876383304595947, + "logps/chosen": -548.38232421875, + "logps/rejected": -674.1826782226562, + "loss": 0.4906, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.774062395095825, + "rewards/margins": 1.34238600730896, + "rewards/rejected": -4.116448402404785, + "step": 3630 + }, + { + "epoch": 0.48, + "grad_norm": 14.875, + "learning_rate": 3.136469590062158e-06, + "logits/chosen": 0.4745601713657379, + "logits/rejected": 1.5338537693023682, + "logps/chosen": -516.0549926757812, + "logps/rejected": -564.0617065429688, + "loss": 0.4777, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.353170156478882, + "rewards/margins": 1.0227165222167969, + "rewards/rejected": -3.3758864402770996, + "step": 3640 + }, + { + "epoch": 0.48, + "grad_norm": 17.0, + "learning_rate": 3.1254186507268354e-06, + "logits/chosen": 0.6987334489822388, + "logits/rejected": 1.4793400764465332, + "logps/chosen": -501.6717224121094, + "logps/rejected": -627.906982421875, + "loss": 0.3542, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.213365077972412, + "rewards/margins": 1.541622281074524, + "rewards/rejected": -3.7549872398376465, + "step": 3650 + }, + { + "epoch": 0.48, + "grad_norm": 20.375, + "learning_rate": 3.114354659550656e-06, + "logits/chosen": 0.7804635167121887, + "logits/rejected": 2.090153217315674, + "logps/chosen": -529.9520263671875, + "logps/rejected": -633.1751098632812, + "loss": 0.4671, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.8481204509735107, + "rewards/margins": 1.3539659976959229, + "rewards/rejected": -4.202086448669434, + "step": 3660 + }, + { + "epoch": 0.48, + "grad_norm": 15.75, + "learning_rate": 3.1032778474276816e-06, + "logits/chosen": 1.4736740589141846, + "logits/rejected": 2.5067548751831055, + "logps/chosen": -538.311767578125, + "logps/rejected": -713.0725708007812, + "loss": 0.4175, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.9481959342956543, + "rewards/margins": 1.6900793313980103, + "rewards/rejected": -4.638275146484375, + "step": 3670 + }, + { + "epoch": 0.48, + "grad_norm": 6.46875, + "learning_rate": 3.092188445519532e-06, + "logits/chosen": 1.6656243801116943, + "logits/rejected": 3.1111550331115723, + "logps/chosen": -513.4066162109375, + "logps/rejected": -629.7359619140625, + "loss": 0.4822, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.9789786338806152, + "rewards/margins": 1.4785501956939697, + "rewards/rejected": -4.457529544830322, + "step": 3680 + }, + { + "epoch": 0.48, + "grad_norm": 12.9375, + "learning_rate": 3.081086685250565e-06, + "logits/chosen": 1.1992745399475098, + "logits/rejected": 2.9142909049987793, + "logps/chosen": -667.6442260742188, + "logps/rejected": -707.6119384765625, + "loss": 0.4743, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.285987138748169, + "rewards/margins": 1.1857128143310547, + "rewards/rejected": -4.4716997146606445, + "step": 3690 + }, + { + "epoch": 0.48, + "grad_norm": 16.25, + "learning_rate": 3.0699727983030434e-06, + "logits/chosen": 1.3589823246002197, + "logits/rejected": 2.0958337783813477, + "logps/chosen": -599.2689208984375, + "logps/rejected": -723.6403198242188, + "loss": 0.5496, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.401282787322998, + "rewards/margins": 1.1826626062393188, + "rewards/rejected": -4.5839457511901855, + "step": 3700 + }, + { + "epoch": 0.48, + "eval_logits/chosen": 2.4471547603607178, + "eval_logits/rejected": 3.31201171875, + "eval_logps/chosen": -570.4303588867188, + "eval_logps/rejected": -677.9390869140625, + "eval_loss": 0.5015013813972473, + "eval_rewards/accuracies": 0.7394999861717224, + "eval_rewards/chosen": -3.058093786239624, + "eval_rewards/margins": 1.2755075693130493, + "eval_rewards/rejected": -4.333600997924805, + "eval_runtime": 1591.8744, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 3700 + }, + { + "epoch": 0.49, + "grad_norm": 23.0, + "learning_rate": 3.058847016612301e-06, + "logits/chosen": 1.3370041847229004, + "logits/rejected": 2.4886152744293213, + "logps/chosen": -585.2352294921875, + "logps/rejected": -648.9256591796875, + "loss": 0.5683, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.335144519805908, + "rewards/margins": 1.0732650756835938, + "rewards/rejected": -4.40841007232666, + "step": 3710 + }, + { + "epoch": 0.49, + "grad_norm": 11.375, + "learning_rate": 3.0477095723619034e-06, + "logits/chosen": 1.6717097759246826, + "logits/rejected": 2.2256245613098145, + "logps/chosen": -526.5162963867188, + "logps/rejected": -641.79638671875, + "loss": 0.4822, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7065768241882324, + "rewards/margins": 1.269716501235962, + "rewards/rejected": -3.9762935638427734, + "step": 3720 + }, + { + "epoch": 0.49, + "grad_norm": 20.5, + "learning_rate": 3.0365606979788003e-06, + "logits/chosen": 1.271213173866272, + "logits/rejected": 2.268070936203003, + "logps/chosen": -607.2047119140625, + "logps/rejected": -735.3359375, + "loss": 0.4284, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.146824836730957, + "rewards/margins": 1.4212199449539185, + "rewards/rejected": -4.568044662475586, + "step": 3730 + }, + { + "epoch": 0.49, + "grad_norm": 15.9375, + "learning_rate": 3.0254006261284786e-06, + "logits/chosen": 1.67838454246521, + "logits/rejected": 2.5358495712280273, + "logps/chosen": -625.5853881835938, + "logps/rejected": -768.9443969726562, + "loss": 0.4578, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.5043373107910156, + "rewards/margins": 1.5824673175811768, + "rewards/rejected": -5.086804389953613, + "step": 3740 + }, + { + "epoch": 0.49, + "grad_norm": 17.625, + "learning_rate": 3.0142295897101032e-06, + "logits/chosen": 1.197113275527954, + "logits/rejected": 1.7569679021835327, + "logps/chosen": -588.1669311523438, + "logps/rejected": -715.07568359375, + "loss": 0.4599, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.412364959716797, + "rewards/margins": 1.3579638004302979, + "rewards/rejected": -4.770328521728516, + "step": 3750 + }, + { + "epoch": 0.49, + "grad_norm": 8.25, + "learning_rate": 3.0030478218516578e-06, + "logits/chosen": 0.9988697171211243, + "logits/rejected": 2.3596878051757812, + "logps/chosen": -614.2236328125, + "logps/rejected": -727.8824462890625, + "loss": 0.4997, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.321038007736206, + "rewards/margins": 1.4818624258041382, + "rewards/rejected": -4.802900791168213, + "step": 3760 + }, + { + "epoch": 0.49, + "grad_norm": 6.4375, + "learning_rate": 2.9918555559050826e-06, + "logits/chosen": 0.6846061944961548, + "logits/rejected": 1.5343295335769653, + "logps/chosen": -570.2298583984375, + "logps/rejected": -632.1084594726562, + "loss": 0.5979, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.1014671325683594, + "rewards/margins": 1.0553886890411377, + "rewards/rejected": -4.156855583190918, + "step": 3770 + }, + { + "epoch": 0.49, + "grad_norm": 15.3125, + "learning_rate": 2.980653025441399e-06, + "logits/chosen": 1.6924612522125244, + "logits/rejected": 2.326639175415039, + "logps/chosen": -585.7066650390625, + "logps/rejected": -714.9122924804688, + "loss": 0.4739, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.2248730659484863, + "rewards/margins": 1.3962581157684326, + "rewards/rejected": -4.621131420135498, + "step": 3780 + }, + { + "epoch": 0.5, + "grad_norm": 5.8125, + "learning_rate": 2.969440464245841e-06, + "logits/chosen": 1.1217143535614014, + "logits/rejected": 2.3194773197174072, + "logps/chosen": -560.1303100585938, + "logps/rejected": -688.375732421875, + "loss": 0.5052, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.2385811805725098, + "rewards/margins": 1.344367504119873, + "rewards/rejected": -4.582947731018066, + "step": 3790 + }, + { + "epoch": 0.5, + "grad_norm": 23.0, + "learning_rate": 2.95821810631297e-06, + "logits/chosen": 1.2779474258422852, + "logits/rejected": 2.844780445098877, + "logps/chosen": -663.8018798828125, + "logps/rejected": -746.4864501953125, + "loss": 0.5106, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.8018813133239746, + "rewards/margins": 1.2830824851989746, + "rewards/rejected": -5.084963798522949, + "step": 3800 + }, + { + "epoch": 0.5, + "eval_logits/chosen": 1.8546652793884277, + "eval_logits/rejected": 2.713374614715576, + "eval_logps/chosen": -615.3915405273438, + "eval_logps/rejected": -726.6728515625, + "eval_loss": 0.5012689232826233, + "eval_rewards/accuracies": 0.7394999861717224, + "eval_rewards/chosen": -3.507704973220825, + "eval_rewards/margins": 1.3132338523864746, + "eval_rewards/rejected": -4.820939064025879, + "eval_runtime": 1592.0058, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 3800 + }, + { + "epoch": 0.5, + "grad_norm": 31.625, + "learning_rate": 2.946986185841801e-06, + "logits/chosen": 0.9378841519355774, + "logits/rejected": 1.745422124862671, + "logps/chosen": -627.324462890625, + "logps/rejected": -722.0845947265625, + "loss": 0.4964, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.6624836921691895, + "rewards/margins": 1.261169672012329, + "rewards/rejected": -4.923653602600098, + "step": 3810 + }, + { + "epoch": 0.5, + "grad_norm": 12.75, + "learning_rate": 2.935744937230903e-06, + "logits/chosen": 1.396041750907898, + "logits/rejected": 1.7260946035385132, + "logps/chosen": -622.05126953125, + "logps/rejected": -763.8098754882812, + "loss": 0.568, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.608628749847412, + "rewards/margins": 1.256679892539978, + "rewards/rejected": -4.8653082847595215, + "step": 3820 + }, + { + "epoch": 0.5, + "grad_norm": 8.8125, + "learning_rate": 2.924494595073517e-06, + "logits/chosen": 0.6841916441917419, + "logits/rejected": 1.274814248085022, + "logps/chosen": -572.808349609375, + "logps/rejected": -694.9969482421875, + "loss": 0.482, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.484985828399658, + "rewards/margins": 1.2095296382904053, + "rewards/rejected": -4.694515228271484, + "step": 3830 + }, + { + "epoch": 0.5, + "grad_norm": 13.1875, + "learning_rate": 2.9132353941526575e-06, + "logits/chosen": 0.7297332286834717, + "logits/rejected": 1.7951176166534424, + "logps/chosen": -656.1531372070312, + "logps/rejected": -790.7908325195312, + "loss": 0.4268, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.7499594688415527, + "rewards/margins": 1.6107057332992554, + "rewards/rejected": -5.360665321350098, + "step": 3840 + }, + { + "epoch": 0.5, + "grad_norm": 16.25, + "learning_rate": 2.901967569436209e-06, + "logits/chosen": 0.5449053049087524, + "logits/rejected": 1.2905735969543457, + "logps/chosen": -650.99853515625, + "logps/rejected": -721.5765380859375, + "loss": 0.543, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.8416759967803955, + "rewards/margins": 1.0251166820526123, + "rewards/rejected": -4.86679220199585, + "step": 3850 + }, + { + "epoch": 0.51, + "grad_norm": 39.0, + "learning_rate": 2.89069135607203e-06, + "logits/chosen": 0.5232194662094116, + "logits/rejected": 1.2776695489883423, + "logps/chosen": -597.3203735351562, + "logps/rejected": -697.5950927734375, + "loss": 0.5231, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.524958848953247, + "rewards/margins": 1.080949306488037, + "rewards/rejected": -4.605908393859863, + "step": 3860 + }, + { + "epoch": 0.51, + "grad_norm": 15.6875, + "learning_rate": 2.8794069893830386e-06, + "logits/chosen": 0.9736347198486328, + "logits/rejected": 2.3645083904266357, + "logps/chosen": -596.4930419921875, + "logps/rejected": -694.7677001953125, + "loss": 0.5365, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.4399502277374268, + "rewards/margins": 1.128644347190857, + "rewards/rejected": -4.568594932556152, + "step": 3870 + }, + { + "epoch": 0.51, + "grad_norm": 8.4375, + "learning_rate": 2.8681147048623038e-06, + "logits/chosen": 0.4126865863800049, + "logits/rejected": 1.3872052431106567, + "logps/chosen": -606.4537353515625, + "logps/rejected": -725.9848022460938, + "loss": 0.405, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.983593225479126, + "rewards/margins": 1.3947454690933228, + "rewards/rejected": -4.378338813781738, + "step": 3880 + }, + { + "epoch": 0.51, + "grad_norm": 27.5, + "learning_rate": 2.8568147381681333e-06, + "logits/chosen": 0.9017072916030884, + "logits/rejected": 1.5411750078201294, + "logps/chosen": -546.4932250976562, + "logps/rejected": -644.0993041992188, + "loss": 0.6068, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.0471713542938232, + "rewards/margins": 1.2161060571670532, + "rewards/rejected": -4.263277530670166, + "step": 3890 + }, + { + "epoch": 0.51, + "grad_norm": 11.5, + "learning_rate": 2.8455073251191533e-06, + "logits/chosen": 0.5226460695266724, + "logits/rejected": 1.6768258810043335, + "logps/chosen": -612.2659912109375, + "logps/rejected": -740.1510620117188, + "loss": 0.376, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.243638515472412, + "rewards/margins": 1.695278525352478, + "rewards/rejected": -4.938917636871338, + "step": 3900 + }, + { + "epoch": 0.51, + "eval_logits/chosen": 1.9628366231918335, + "eval_logits/rejected": 2.773855447769165, + "eval_logps/chosen": -590.9802856445312, + "eval_logps/rejected": -697.17529296875, + "eval_loss": 0.4994959235191345, + "eval_rewards/accuracies": 0.7415000200271606, + "eval_rewards/chosen": -3.263592481613159, + "eval_rewards/margins": 1.2623705863952637, + "eval_rewards/rejected": -4.525962829589844, + "eval_runtime": 1593.0408, + "eval_samples_per_second": 1.255, + "eval_steps_per_second": 0.314, + "step": 3900 + }, + { + "epoch": 0.51, + "grad_norm": 8.625, + "learning_rate": 2.8341927016893887e-06, + "logits/chosen": 1.4300041198730469, + "logits/rejected": 1.2578006982803345, + "logps/chosen": -591.4417114257812, + "logps/rejected": -711.3917236328125, + "loss": 0.4788, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3117687702178955, + "rewards/margins": 1.2131303548812866, + "rewards/rejected": -4.524899482727051, + "step": 3910 + }, + { + "epoch": 0.51, + "grad_norm": 14.0, + "learning_rate": 2.822871104003335e-06, + "logits/chosen": 0.9081200361251831, + "logits/rejected": 2.4838366508483887, + "logps/chosen": -589.5599365234375, + "logps/rejected": -662.2095947265625, + "loss": 0.4594, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.023688316345215, + "rewards/margins": 1.5029232501983643, + "rewards/rejected": -4.526611328125, + "step": 3920 + }, + { + "epoch": 0.51, + "grad_norm": 12.875, + "learning_rate": 2.8115427683310355e-06, + "logits/chosen": 0.47768887877464294, + "logits/rejected": 1.4875624179840088, + "logps/chosen": -587.0402221679688, + "logps/rejected": -677.3700561523438, + "loss": 0.5306, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.184080123901367, + "rewards/margins": 1.084136962890625, + "rewards/rejected": -4.268217086791992, + "step": 3930 + }, + { + "epoch": 0.52, + "grad_norm": 30.875, + "learning_rate": 2.8002079310831477e-06, + "logits/chosen": 0.6028070449829102, + "logits/rejected": 1.0062382221221924, + "logps/chosen": -555.0243530273438, + "logps/rejected": -614.6267700195312, + "loss": 0.65, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.950934410095215, + "rewards/margins": 0.8431466817855835, + "rewards/rejected": -3.794081211090088, + "step": 3940 + }, + { + "epoch": 0.52, + "grad_norm": 5.6875, + "learning_rate": 2.7888668288060095e-06, + "logits/chosen": 0.39542144536972046, + "logits/rejected": 1.2938110828399658, + "logps/chosen": -533.0115356445312, + "logps/rejected": -613.7713623046875, + "loss": 0.5404, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.655400514602661, + "rewards/margins": 1.038409948348999, + "rewards/rejected": -3.6938107013702393, + "step": 3950 + }, + { + "epoch": 0.52, + "grad_norm": 12.75, + "learning_rate": 2.7775196981767044e-06, + "logits/chosen": 0.7035880088806152, + "logits/rejected": 1.4570562839508057, + "logps/chosen": -535.3646240234375, + "logps/rejected": -627.9224853515625, + "loss": 0.5444, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.855009078979492, + "rewards/margins": 1.1241127252578735, + "rewards/rejected": -3.9791221618652344, + "step": 3960 + }, + { + "epoch": 0.52, + "grad_norm": 20.0, + "learning_rate": 2.7661667759981213e-06, + "logits/chosen": 0.7251302003860474, + "logits/rejected": 1.8517974615097046, + "logps/chosen": -567.191162109375, + "logps/rejected": -618.9002685546875, + "loss": 0.6056, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.1272902488708496, + "rewards/margins": 0.8452841639518738, + "rewards/rejected": -3.9725747108459473, + "step": 3970 + }, + { + "epoch": 0.52, + "grad_norm": 38.5, + "learning_rate": 2.7548082991940137e-06, + "logits/chosen": 0.8394457101821899, + "logits/rejected": 1.7973182201385498, + "logps/chosen": -553.3853149414062, + "logps/rejected": -612.0332641601562, + "loss": 0.6223, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0196127891540527, + "rewards/margins": 0.9670158624649048, + "rewards/rejected": -3.986628770828247, + "step": 3980 + }, + { + "epoch": 0.52, + "grad_norm": 9.4375, + "learning_rate": 2.743444504804051e-06, + "logits/chosen": 0.6090790033340454, + "logits/rejected": 1.5463039875030518, + "logps/chosen": -559.8428955078125, + "logps/rejected": -631.6338500976562, + "loss": 0.4786, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.7995877265930176, + "rewards/margins": 1.0208765268325806, + "rewards/rejected": -3.8204643726348877, + "step": 3990 + }, + { + "epoch": 0.52, + "grad_norm": 17.125, + "learning_rate": 2.7320756299788788e-06, + "logits/chosen": 0.40987616777420044, + "logits/rejected": 1.202165961265564, + "logps/chosen": -534.3167724609375, + "logps/rejected": -659.1214599609375, + "loss": 0.4935, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.807413101196289, + "rewards/margins": 1.016972303390503, + "rewards/rejected": -3.824385404586792, + "step": 4000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": 1.5516222715377808, + "eval_logits/rejected": 2.2899205684661865, + "eval_logps/chosen": -547.131103515625, + "eval_logps/rejected": -640.8605346679688, + "eval_loss": 0.4915643334388733, + "eval_rewards/accuracies": 0.7465000152587891, + "eval_rewards/chosen": -2.825101137161255, + "eval_rewards/margins": 1.137715458869934, + "eval_rewards/rejected": -3.9628164768218994, + "eval_runtime": 1592.3492, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 4000 + }, + { + "epoch": 0.52, + "grad_norm": 13.125, + "learning_rate": 2.7207019119751644e-06, + "logits/chosen": 0.3096240162849426, + "logits/rejected": 1.2950459718704224, + "logps/chosen": -519.6134643554688, + "logps/rejected": -585.9755859375, + "loss": 0.4758, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.6585440635681152, + "rewards/margins": 1.1354074478149414, + "rewards/rejected": -3.7939515113830566, + "step": 4010 + }, + { + "epoch": 0.53, + "grad_norm": 33.25, + "learning_rate": 2.7093235881506474e-06, + "logits/chosen": 1.2475144863128662, + "logits/rejected": 1.9860883951187134, + "logps/chosen": -569.3560791015625, + "logps/rejected": -669.8253784179688, + "loss": 0.4712, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9248297214508057, + "rewards/margins": 1.4279091358184814, + "rewards/rejected": -4.352738857269287, + "step": 4020 + }, + { + "epoch": 0.53, + "grad_norm": 14.375, + "learning_rate": 2.6979408959591863e-06, + "logits/chosen": 0.5688766241073608, + "logits/rejected": 0.8400389552116394, + "logps/chosen": -561.5079956054688, + "logps/rejected": -659.046875, + "loss": 0.6009, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.01423716545105, + "rewards/margins": 0.922036349773407, + "rewards/rejected": -3.9362735748291016, + "step": 4030 + }, + { + "epoch": 0.53, + "grad_norm": 17.375, + "learning_rate": 2.6865540729458034e-06, + "logits/chosen": 0.10123654454946518, + "logits/rejected": 0.8616682887077332, + "logps/chosen": -576.2913208007812, + "logps/rejected": -641.7877807617188, + "loss": 0.5364, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.760798931121826, + "rewards/margins": 1.0089768171310425, + "rewards/rejected": -3.769775867462158, + "step": 4040 + }, + { + "epoch": 0.53, + "grad_norm": 10.4375, + "learning_rate": 2.675163356741726e-06, + "logits/chosen": 0.42007890343666077, + "logits/rejected": 1.0813418626785278, + "logps/chosen": -470.1661682128906, + "logps/rejected": -569.1080932617188, + "loss": 0.4399, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.5839219093322754, + "rewards/margins": 1.1666027307510376, + "rewards/rejected": -3.7505240440368652, + "step": 4050 + }, + { + "epoch": 0.53, + "grad_norm": 8.625, + "learning_rate": 2.6637689850594285e-06, + "logits/chosen": 0.4460233747959137, + "logits/rejected": 1.1718910932540894, + "logps/chosen": -530.4000854492188, + "logps/rejected": -595.0211181640625, + "loss": 0.5224, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.648142099380493, + "rewards/margins": 0.9859753847122192, + "rewards/rejected": -3.634117603302002, + "step": 4060 + }, + { + "epoch": 0.53, + "grad_norm": 14.5, + "learning_rate": 2.652371195687671e-06, + "logits/chosen": 0.7797343730926514, + "logits/rejected": 1.4408290386199951, + "logps/chosen": -500.3291015625, + "logps/rejected": -628.8976440429688, + "loss": 0.4065, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4375433921813965, + "rewards/margins": 1.5079452991485596, + "rewards/rejected": -3.945488691329956, + "step": 4070 + }, + { + "epoch": 0.53, + "grad_norm": 33.5, + "learning_rate": 2.64097022648654e-06, + "logits/chosen": 0.49426335096359253, + "logits/rejected": 1.7455838918685913, + "logps/chosen": -594.4205932617188, + "logps/rejected": -676.2381591796875, + "loss": 0.5492, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9470136165618896, + "rewards/margins": 1.3631705045700073, + "rewards/rejected": -4.310183525085449, + "step": 4080 + }, + { + "epoch": 0.54, + "grad_norm": 12.6875, + "learning_rate": 2.6295663153824774e-06, + "logits/chosen": 0.4955861568450928, + "logits/rejected": 1.726845145225525, + "logps/chosen": -572.8784790039062, + "logps/rejected": -633.9725952148438, + "loss": 0.5038, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.8294196128845215, + "rewards/margins": 1.260789394378662, + "rewards/rejected": -4.090209007263184, + "step": 4090 + }, + { + "epoch": 0.54, + "grad_norm": 12.5625, + "learning_rate": 2.6181597003633218e-06, + "logits/chosen": 0.7141835689544678, + "logits/rejected": 2.0009679794311523, + "logps/chosen": -566.5924072265625, + "logps/rejected": -663.8424072265625, + "loss": 0.445, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.8427367210388184, + "rewards/margins": 1.4427145719528198, + "rewards/rejected": -4.2854509353637695, + "step": 4100 + }, + { + "epoch": 0.54, + "eval_logits/chosen": 1.8263081312179565, + "eval_logits/rejected": 2.594874858856201, + "eval_logps/chosen": -577.61767578125, + "eval_logps/rejected": -685.20458984375, + "eval_loss": 0.4958656132221222, + "eval_rewards/accuracies": 0.7480000257492065, + "eval_rewards/chosen": -3.1299664974212646, + "eval_rewards/margins": 1.276289939880371, + "eval_rewards/rejected": -4.406256198883057, + "eval_runtime": 1592.5081, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 4100 + }, + { + "epoch": 0.54, + "grad_norm": 8.6875, + "learning_rate": 2.606750619473342e-06, + "logits/chosen": 1.2048364877700806, + "logits/rejected": 1.8250070810317993, + "logps/chosen": -559.253662109375, + "logps/rejected": -670.3594970703125, + "loss": 0.465, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.046375036239624, + "rewards/margins": 1.1393458843231201, + "rewards/rejected": -4.185720920562744, + "step": 4110 + }, + { + "epoch": 0.54, + "grad_norm": 19.25, + "learning_rate": 2.595339310808262e-06, + "logits/chosen": 0.7069088220596313, + "logits/rejected": 1.2920167446136475, + "logps/chosen": -541.710205078125, + "logps/rejected": -664.6094360351562, + "loss": 0.3956, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.00246524810791, + "rewards/margins": 1.356275200843811, + "rewards/rejected": -4.358740329742432, + "step": 4120 + }, + { + "epoch": 0.54, + "grad_norm": 26.125, + "learning_rate": 2.5839260125103004e-06, + "logits/chosen": 0.9028911590576172, + "logits/rejected": 1.8872220516204834, + "logps/chosen": -589.1178588867188, + "logps/rejected": -703.8515625, + "loss": 0.428, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1477065086364746, + "rewards/margins": 1.3748151063919067, + "rewards/rejected": -4.522521495819092, + "step": 4130 + }, + { + "epoch": 0.54, + "grad_norm": 24.25, + "learning_rate": 2.5725109627631984e-06, + "logits/chosen": 0.8007787466049194, + "logits/rejected": 1.5194114446640015, + "logps/chosen": -595.8224487304688, + "logps/rejected": -692.4710693359375, + "loss": 0.4624, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.188469409942627, + "rewards/margins": 1.274657964706421, + "rewards/rejected": -4.463127613067627, + "step": 4140 + }, + { + "epoch": 0.54, + "grad_norm": 47.5, + "learning_rate": 2.5610943997872443e-06, + "logits/chosen": 1.1802794933319092, + "logits/rejected": 1.5942678451538086, + "logps/chosen": -628.5667114257812, + "logps/rejected": -740.0662231445312, + "loss": 0.5069, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.7526698112487793, + "rewards/margins": 1.1907706260681152, + "rewards/rejected": -4.9434404373168945, + "step": 4150 + }, + { + "epoch": 0.54, + "grad_norm": 22.125, + "learning_rate": 2.5496765618343096e-06, + "logits/chosen": 0.9174972772598267, + "logits/rejected": 1.2704370021820068, + "logps/chosen": -620.0501098632812, + "logps/rejected": -710.2423095703125, + "loss": 0.6713, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.6227316856384277, + "rewards/margins": 0.9411904215812683, + "rewards/rejected": -4.563921928405762, + "step": 4160 + }, + { + "epoch": 0.55, + "grad_norm": 13.5, + "learning_rate": 2.538257687182871e-06, + "logits/chosen": 1.4342944622039795, + "logits/rejected": 2.462162494659424, + "logps/chosen": -576.3062744140625, + "logps/rejected": -697.0335693359375, + "loss": 0.5501, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.4480528831481934, + "rewards/margins": 1.26388680934906, + "rewards/rejected": -4.711940288543701, + "step": 4170 + }, + { + "epoch": 0.55, + "grad_norm": 32.5, + "learning_rate": 2.526838014133041e-06, + "logits/chosen": 1.0441315174102783, + "logits/rejected": 1.7808837890625, + "logps/chosen": -587.8073120117188, + "logps/rejected": -674.4149169921875, + "loss": 0.5703, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.2182915210723877, + "rewards/margins": 1.070989966392517, + "rewards/rejected": -4.289281368255615, + "step": 4180 + }, + { + "epoch": 0.55, + "grad_norm": 10.25, + "learning_rate": 2.515417781001594e-06, + "logits/chosen": 0.7826089859008789, + "logits/rejected": 1.5784668922424316, + "logps/chosen": -499.166015625, + "logps/rejected": -613.2916259765625, + "loss": 0.4251, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.5489115715026855, + "rewards/margins": 1.3632811307907104, + "rewards/rejected": -3.9121928215026855, + "step": 4190 + }, + { + "epoch": 0.55, + "grad_norm": 8.9375, + "learning_rate": 2.503997226116992e-06, + "logits/chosen": 1.1384754180908203, + "logits/rejected": 1.1739859580993652, + "logps/chosen": -487.3101501464844, + "logps/rejected": -636.6060180664062, + "loss": 0.443, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4144322872161865, + "rewards/margins": 1.3946157693862915, + "rewards/rejected": -3.8090476989746094, + "step": 4200 + }, + { + "epoch": 0.55, + "eval_logits/chosen": 1.7637161016464233, + "eval_logits/rejected": 2.5643436908721924, + "eval_logps/chosen": -525.6652221679688, + "eval_logps/rejected": -636.2509765625, + "eval_loss": 0.5038745999336243, + "eval_rewards/accuracies": 0.734499990940094, + "eval_rewards/chosen": -2.6104423999786377, + "eval_rewards/margins": 1.306278109550476, + "eval_rewards/rejected": -3.916720390319824, + "eval_runtime": 1591.7784, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 4200 + }, + { + "epoch": 0.55, + "grad_norm": 13.3125, + "learning_rate": 2.4925765878144115e-06, + "logits/chosen": 1.3933148384094238, + "logits/rejected": 2.002098560333252, + "logps/chosen": -489.69842529296875, + "logps/rejected": -591.0333862304688, + "loss": 0.5167, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4099645614624023, + "rewards/margins": 1.2759312391281128, + "rewards/rejected": -3.6858959197998047, + "step": 4210 + }, + { + "epoch": 0.55, + "grad_norm": 23.375, + "learning_rate": 2.4811561044307727e-06, + "logits/chosen": 0.9644983410835266, + "logits/rejected": 1.7500193119049072, + "logps/chosen": -487.23187255859375, + "logps/rejected": -627.9241943359375, + "loss": 0.4398, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5000617504119873, + "rewards/margins": 1.471921682357788, + "rewards/rejected": -3.9719836711883545, + "step": 4220 + }, + { + "epoch": 0.55, + "grad_norm": 16.125, + "learning_rate": 2.469736014299758e-06, + "logits/chosen": 0.8705085515975952, + "logits/rejected": 1.7094383239746094, + "logps/chosen": -528.7936401367188, + "logps/rejected": -636.2086181640625, + "loss": 0.411, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.6657662391662598, + "rewards/margins": 1.3999571800231934, + "rewards/rejected": -4.065723419189453, + "step": 4230 + }, + { + "epoch": 0.55, + "grad_norm": 17.75, + "learning_rate": 2.458316555746846e-06, + "logits/chosen": 1.1141247749328613, + "logits/rejected": 1.7977077960968018, + "logps/chosen": -582.32958984375, + "logps/rejected": -719.8034057617188, + "loss": 0.5544, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.231219530105591, + "rewards/margins": 1.3708423376083374, + "rewards/rejected": -4.6020612716674805, + "step": 4240 + }, + { + "epoch": 0.56, + "grad_norm": 14.875, + "learning_rate": 2.446897967084334e-06, + "logits/chosen": 1.4410032033920288, + "logits/rejected": 2.145479679107666, + "logps/chosen": -590.9288940429688, + "logps/rejected": -746.0863037109375, + "loss": 0.4088, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.4452788829803467, + "rewards/margins": 1.6681804656982422, + "rewards/rejected": -5.113459587097168, + "step": 4250 + }, + { + "epoch": 0.56, + "grad_norm": 26.5, + "learning_rate": 2.4354804866063684e-06, + "logits/chosen": 1.1707557439804077, + "logits/rejected": 1.9762599468231201, + "logps/chosen": -612.4578857421875, + "logps/rejected": -760.2510986328125, + "loss": 0.3818, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.5203795433044434, + "rewards/margins": 1.672025442123413, + "rewards/rejected": -5.1924052238464355, + "step": 4260 + }, + { + "epoch": 0.56, + "grad_norm": 16.25, + "learning_rate": 2.424064352583964e-06, + "logits/chosen": 1.2009286880493164, + "logits/rejected": 2.1041266918182373, + "logps/chosen": -564.6832275390625, + "logps/rejected": -678.2476806640625, + "loss": 0.4951, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.084103584289551, + "rewards/margins": 1.2054665088653564, + "rewards/rejected": -4.28956937789917, + "step": 4270 + }, + { + "epoch": 0.56, + "grad_norm": 8.1875, + "learning_rate": 2.4126498032600403e-06, + "logits/chosen": 0.9922968149185181, + "logits/rejected": 1.713650107383728, + "logps/chosen": -616.302734375, + "logps/rejected": -700.6239624023438, + "loss": 0.5001, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.2124435901641846, + "rewards/margins": 1.3036648035049438, + "rewards/rejected": -4.51610803604126, + "step": 4280 + }, + { + "epoch": 0.56, + "grad_norm": 22.625, + "learning_rate": 2.401237076844445e-06, + "logits/chosen": 1.120624303817749, + "logits/rejected": 1.7306255102157593, + "logps/chosen": -510.9451599121094, + "logps/rejected": -606.6317749023438, + "loss": 0.5167, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5382046699523926, + "rewards/margins": 1.3079121112823486, + "rewards/rejected": -3.8461170196533203, + "step": 4290 + }, + { + "epoch": 0.56, + "grad_norm": 18.375, + "learning_rate": 2.38982641150898e-06, + "logits/chosen": 1.0842640399932861, + "logits/rejected": 1.4181780815124512, + "logps/chosen": -489.26507568359375, + "logps/rejected": -598.4462280273438, + "loss": 0.517, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.7224583625793457, + "rewards/margins": 1.2289758920669556, + "rewards/rejected": -3.9514336585998535, + "step": 4300 + }, + { + "epoch": 0.56, + "eval_logits/chosen": 1.854472279548645, + "eval_logits/rejected": 2.621220588684082, + "eval_logps/chosen": -570.7054443359375, + "eval_logps/rejected": -689.4329833984375, + "eval_loss": 0.5042153596878052, + "eval_rewards/accuracies": 0.737500011920929, + "eval_rewards/chosen": -3.0608439445495605, + "eval_rewards/margins": 1.3876967430114746, + "eval_rewards/rejected": -4.448540687561035, + "eval_runtime": 1592.3251, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 4300 + }, + { + "epoch": 0.56, + "grad_norm": 19.5, + "learning_rate": 2.3784180453824414e-06, + "logits/chosen": 1.0778725147247314, + "logits/rejected": 2.368303060531616, + "logps/chosen": -607.5311889648438, + "logps/rejected": -722.4293823242188, + "loss": 0.4607, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.2945079803466797, + "rewards/margins": 1.4340566396713257, + "rewards/rejected": -4.728564739227295, + "step": 4310 + }, + { + "epoch": 0.57, + "grad_norm": 17.0, + "learning_rate": 2.367012216545638e-06, + "logits/chosen": 0.9011770486831665, + "logits/rejected": 1.6108005046844482, + "logps/chosen": -679.2977905273438, + "logps/rejected": -808.9659423828125, + "loss": 0.4776, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.7852683067321777, + "rewards/margins": 1.5607531070709229, + "rewards/rejected": -5.34602165222168, + "step": 4320 + }, + { + "epoch": 0.57, + "grad_norm": 23.875, + "learning_rate": 2.3556091630264294e-06, + "logits/chosen": 1.6703239679336548, + "logits/rejected": 2.2463738918304443, + "logps/chosen": -645.5777587890625, + "logps/rejected": -776.7327270507812, + "loss": 0.4236, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.643219470977783, + "rewards/margins": 1.4833171367645264, + "rewards/rejected": -5.1265363693237305, + "step": 4330 + }, + { + "epoch": 0.57, + "grad_norm": 21.375, + "learning_rate": 2.344209122794757e-06, + "logits/chosen": 1.0503277778625488, + "logits/rejected": 2.3379592895507812, + "logps/chosen": -627.8720703125, + "logps/rejected": -737.0346069335938, + "loss": 0.485, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.7017154693603516, + "rewards/margins": 1.5395132303237915, + "rewards/rejected": -5.241229057312012, + "step": 4340 + }, + { + "epoch": 0.57, + "grad_norm": 14.3125, + "learning_rate": 2.3328123337576787e-06, + "logits/chosen": 1.0765559673309326, + "logits/rejected": 1.8498785495758057, + "logps/chosen": -631.2354125976562, + "logps/rejected": -735.3282470703125, + "loss": 0.5706, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.5931332111358643, + "rewards/margins": 1.1502000093460083, + "rewards/rejected": -4.743332862854004, + "step": 4350 + }, + { + "epoch": 0.57, + "grad_norm": 15.125, + "learning_rate": 2.3214190337544017e-06, + "logits/chosen": 0.7766814827919006, + "logits/rejected": 1.5930789709091187, + "logps/chosen": -567.65576171875, + "logps/rejected": -674.5923461914062, + "loss": 0.5605, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.2872467041015625, + "rewards/margins": 1.0767381191253662, + "rewards/rejected": -4.363985061645508, + "step": 4360 + }, + { + "epoch": 0.57, + "grad_norm": 14.1875, + "learning_rate": 2.310029460551323e-06, + "logits/chosen": 0.7130604982376099, + "logits/rejected": 1.5312939882278442, + "logps/chosen": -561.925537109375, + "logps/rejected": -691.440673828125, + "loss": 0.4895, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.86440372467041, + "rewards/margins": 1.4014947414398193, + "rewards/rejected": -4.265898704528809, + "step": 4370 + }, + { + "epoch": 0.57, + "grad_norm": 11.625, + "learning_rate": 2.2986438518370645e-06, + "logits/chosen": 0.2056771069765091, + "logits/rejected": 1.5509610176086426, + "logps/chosen": -534.5150146484375, + "logps/rejected": -645.1749267578125, + "loss": 0.3948, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.609005928039551, + "rewards/margins": 1.5315120220184326, + "rewards/rejected": -4.140517711639404, + "step": 4380 + }, + { + "epoch": 0.57, + "grad_norm": 10.1875, + "learning_rate": 2.2872624452175123e-06, + "logits/chosen": 0.7365008592605591, + "logits/rejected": 1.443713903427124, + "logps/chosen": -478.4400329589844, + "logps/rejected": -601.012451171875, + "loss": 0.4489, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.5473170280456543, + "rewards/margins": 1.3967925310134888, + "rewards/rejected": -3.9441094398498535, + "step": 4390 + }, + { + "epoch": 0.58, + "grad_norm": 12.125, + "learning_rate": 2.2758854782108584e-06, + "logits/chosen": 0.8305877447128296, + "logits/rejected": 1.501903772354126, + "logps/chosen": -498.0873107910156, + "logps/rejected": -665.9342041015625, + "loss": 0.3693, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.7014124393463135, + "rewards/margins": 1.7031335830688477, + "rewards/rejected": -4.404545783996582, + "step": 4400 + }, + { + "epoch": 0.58, + "eval_logits/chosen": 1.8050862550735474, + "eval_logits/rejected": 2.517800807952881, + "eval_logps/chosen": -591.6002197265625, + "eval_logps/rejected": -700.556396484375, + "eval_loss": 0.4968615472316742, + "eval_rewards/accuracies": 0.746999979019165, + "eval_rewards/chosen": -3.2697925567626953, + "eval_rewards/margins": 1.2899818420410156, + "eval_rewards/rejected": -4.559774875640869, + "eval_runtime": 1591.5803, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 4400 + }, + { + "epoch": 0.58, + "grad_norm": 15.5, + "learning_rate": 2.2645131882426458e-06, + "logits/chosen": 0.9317380785942078, + "logits/rejected": 1.8292909860610962, + "logps/chosen": -587.1124267578125, + "logps/rejected": -757.9627685546875, + "loss": 0.4073, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.214625835418701, + "rewards/margins": 1.690768837928772, + "rewards/rejected": -4.905394554138184, + "step": 4410 + }, + { + "epoch": 0.58, + "grad_norm": 9.9375, + "learning_rate": 2.2531458126408154e-06, + "logits/chosen": 1.728948950767517, + "logits/rejected": 1.7796413898468018, + "logps/chosen": -652.2147216796875, + "logps/rejected": -797.2843627929688, + "loss": 0.5817, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.8443686962127686, + "rewards/margins": 1.2194750308990479, + "rewards/rejected": -5.063844203948975, + "step": 4420 + }, + { + "epoch": 0.58, + "grad_norm": 20.375, + "learning_rate": 2.2417835886307452e-06, + "logits/chosen": 1.2968741655349731, + "logits/rejected": 1.9664971828460693, + "logps/chosen": -571.7467041015625, + "logps/rejected": -706.6888427734375, + "loss": 0.4778, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.410456895828247, + "rewards/margins": 1.3251036405563354, + "rewards/rejected": -4.735560417175293, + "step": 4430 + }, + { + "epoch": 0.58, + "grad_norm": 11.0625, + "learning_rate": 2.2304267533303075e-06, + "logits/chosen": 1.0025638341903687, + "logits/rejected": 2.462754011154175, + "logps/chosen": -603.9998168945312, + "logps/rejected": -628.0054321289062, + "loss": 0.517, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.4204928874969482, + "rewards/margins": 1.0212152004241943, + "rewards/rejected": -4.441708087921143, + "step": 4440 + }, + { + "epoch": 0.58, + "grad_norm": 15.8125, + "learning_rate": 2.219075543744918e-06, + "logits/chosen": 1.2630012035369873, + "logits/rejected": 1.9615631103515625, + "logps/chosen": -565.0428466796875, + "logps/rejected": -648.239013671875, + "loss": 0.6442, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.4224343299865723, + "rewards/margins": 0.932254433631897, + "rewards/rejected": -4.354689121246338, + "step": 4450 + }, + { + "epoch": 0.58, + "grad_norm": 23.75, + "learning_rate": 2.207730196762589e-06, + "logits/chosen": 0.7253775596618652, + "logits/rejected": 0.9075161814689636, + "logps/chosen": -567.9874267578125, + "logps/rejected": -690.0647583007812, + "loss": 0.4821, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.9466922283172607, + "rewards/margins": 1.2171391248703003, + "rewards/rejected": -4.163830757141113, + "step": 4460 + }, + { + "epoch": 0.58, + "grad_norm": 17.375, + "learning_rate": 2.1963909491489846e-06, + "logits/chosen": 0.6194896697998047, + "logits/rejected": 1.25758957862854, + "logps/chosen": -557.9462890625, + "logps/rejected": -642.5169677734375, + "loss": 0.4892, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.9200026988983154, + "rewards/margins": 1.0755693912506104, + "rewards/rejected": -3.9955718517303467, + "step": 4470 + }, + { + "epoch": 0.59, + "grad_norm": 12.1875, + "learning_rate": 2.185058037542486e-06, + "logits/chosen": 0.48488321900367737, + "logits/rejected": 0.8963411450386047, + "logps/chosen": -525.1871948242188, + "logps/rejected": -625.0621948242188, + "loss": 0.5098, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.7714436054229736, + "rewards/margins": 1.1916885375976562, + "rewards/rejected": -3.963132381439209, + "step": 4480 + }, + { + "epoch": 0.59, + "grad_norm": 17.25, + "learning_rate": 2.173731698449244e-06, + "logits/chosen": 0.5312266945838928, + "logits/rejected": 1.2149841785430908, + "logps/chosen": -601.7564697265625, + "logps/rejected": -717.5197143554688, + "loss": 0.4883, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1194474697113037, + "rewards/margins": 1.1115381717681885, + "rewards/rejected": -4.230985641479492, + "step": 4490 + }, + { + "epoch": 0.59, + "grad_norm": 14.875, + "learning_rate": 2.1624121682382495e-06, + "logits/chosen": 0.9723777770996094, + "logits/rejected": 1.461883783340454, + "logps/chosen": -538.8240966796875, + "logps/rejected": -631.4890747070312, + "loss": 0.481, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7736334800720215, + "rewards/margins": 1.180018663406372, + "rewards/rejected": -3.9536521434783936, + "step": 4500 + }, + { + "epoch": 0.59, + "eval_logits/chosen": 1.3648182153701782, + "eval_logits/rejected": 2.0328919887542725, + "eval_logps/chosen": -545.3853149414062, + "eval_logps/rejected": -640.7147827148438, + "eval_loss": 0.48925405740737915, + "eval_rewards/accuracies": 0.7444999814033508, + "eval_rewards/chosen": -2.807643175125122, + "eval_rewards/margins": 1.1537160873413086, + "eval_rewards/rejected": -3.9613587856292725, + "eval_runtime": 1591.7001, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 4500 + }, + { + "epoch": 0.59, + "grad_norm": 12.375, + "learning_rate": 2.1510996831363993e-06, + "logits/chosen": 0.13078482449054718, + "logits/rejected": 1.3275985717773438, + "logps/chosen": -543.0574340820312, + "logps/rejected": -632.9281616210938, + "loss": 0.4247, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.831533908843994, + "rewards/margins": 1.3064507246017456, + "rewards/rejected": -4.137984275817871, + "step": 4510 + }, + { + "epoch": 0.59, + "grad_norm": 9.375, + "learning_rate": 2.139794479223565e-06, + "logits/chosen": 0.3153165578842163, + "logits/rejected": 1.1119401454925537, + "logps/chosen": -571.6793823242188, + "logps/rejected": -646.7637939453125, + "loss": 0.4975, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.9295685291290283, + "rewards/margins": 1.1337769031524658, + "rewards/rejected": -4.063345432281494, + "step": 4520 + }, + { + "epoch": 0.59, + "grad_norm": 15.25, + "learning_rate": 2.128496792427669e-06, + "logits/chosen": 0.5227512121200562, + "logits/rejected": 1.2176361083984375, + "logps/chosen": -530.2662353515625, + "logps/rejected": -652.4630126953125, + "loss": 0.4378, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.744332790374756, + "rewards/margins": 1.3382190465927124, + "rewards/rejected": -4.082551956176758, + "step": 4530 + }, + { + "epoch": 0.59, + "grad_norm": 19.875, + "learning_rate": 2.117206858519758e-06, + "logits/chosen": 1.0087593793869019, + "logits/rejected": 1.6051127910614014, + "logps/chosen": -527.2972412109375, + "logps/rejected": -705.387451171875, + "loss": 0.399, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.8391306400299072, + "rewards/margins": 1.7159910202026367, + "rewards/rejected": -4.555121421813965, + "step": 4540 + }, + { + "epoch": 0.6, + "grad_norm": 21.25, + "learning_rate": 2.1059249131090844e-06, + "logits/chosen": 0.5082886219024658, + "logits/rejected": 1.4257456064224243, + "logps/chosen": -575.4322509765625, + "logps/rejected": -634.91845703125, + "loss": 0.5462, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.027864456176758, + "rewards/margins": 1.0270512104034424, + "rewards/rejected": -4.054915428161621, + "step": 4550 + }, + { + "epoch": 0.6, + "grad_norm": 21.5, + "learning_rate": 2.094651191638189e-06, + "logits/chosen": 0.4393271803855896, + "logits/rejected": 1.1420228481292725, + "logps/chosen": -534.1982421875, + "logps/rejected": -623.5012817382812, + "loss": 0.522, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.76639986038208, + "rewards/margins": 1.1037254333496094, + "rewards/rejected": -3.8701255321502686, + "step": 4560 + }, + { + "epoch": 0.6, + "grad_norm": 38.25, + "learning_rate": 2.0833859293779867e-06, + "logits/chosen": 1.1747925281524658, + "logits/rejected": 1.879563570022583, + "logps/chosen": -539.6349487304688, + "logps/rejected": -643.480712890625, + "loss": 0.5247, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.930851697921753, + "rewards/margins": 1.1933619976043701, + "rewards/rejected": -4.124213218688965, + "step": 4570 + }, + { + "epoch": 0.6, + "grad_norm": 32.0, + "learning_rate": 2.0721293614228568e-06, + "logits/chosen": 1.0092575550079346, + "logits/rejected": 1.8891198635101318, + "logps/chosen": -550.1498413085938, + "logps/rejected": -662.275634765625, + "loss": 0.5383, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.221567153930664, + "rewards/margins": 1.1925783157348633, + "rewards/rejected": -4.414145469665527, + "step": 4580 + }, + { + "epoch": 0.6, + "grad_norm": 17.25, + "learning_rate": 2.060881722685742e-06, + "logits/chosen": 1.0075985193252563, + "logits/rejected": 1.9096540212631226, + "logps/chosen": -592.787353515625, + "logps/rejected": -713.7469482421875, + "loss": 0.4931, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.3248486518859863, + "rewards/margins": 1.3827643394470215, + "rewards/rejected": -4.70761251449585, + "step": 4590 + }, + { + "epoch": 0.6, + "grad_norm": 12.9375, + "learning_rate": 2.049643247893235e-06, + "logits/chosen": 1.0936925411224365, + "logits/rejected": 1.6271178722381592, + "logps/chosen": -523.0247802734375, + "logps/rejected": -647.8020629882812, + "loss": 0.4696, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.0114166736602783, + "rewards/margins": 1.296616792678833, + "rewards/rejected": -4.308033466339111, + "step": 4600 + }, + { + "epoch": 0.6, + "eval_logits/chosen": 1.940106987953186, + "eval_logits/rejected": 2.6733126640319824, + "eval_logps/chosen": -598.3125, + "eval_logps/rejected": -704.406494140625, + "eval_loss": 0.4945114850997925, + "eval_rewards/accuracies": 0.7465000152587891, + "eval_rewards/chosen": -3.3369147777557373, + "eval_rewards/margins": 1.261361002922058, + "eval_rewards/rejected": -4.598275661468506, + "eval_runtime": 1591.7121, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 4600 + }, + { + "epoch": 0.6, + "grad_norm": 11.25, + "learning_rate": 2.0384141715806903e-06, + "logits/chosen": 0.8319090008735657, + "logits/rejected": 1.6114375591278076, + "logps/chosen": -647.0298461914062, + "logps/rejected": -705.9091186523438, + "loss": 0.5693, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.422800064086914, + "rewards/margins": 0.8977988958358765, + "rewards/rejected": -4.320598602294922, + "step": 4610 + }, + { + "epoch": 0.6, + "grad_norm": 14.1875, + "learning_rate": 2.0271947280873255e-06, + "logits/chosen": 0.5834625959396362, + "logits/rejected": 1.2262613773345947, + "logps/chosen": -615.0538330078125, + "logps/rejected": -690.6492919921875, + "loss": 0.4892, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.2901012897491455, + "rewards/margins": 0.9858062863349915, + "rewards/rejected": -4.275907039642334, + "step": 4620 + }, + { + "epoch": 0.61, + "grad_norm": 9.625, + "learning_rate": 2.0159851515513302e-06, + "logits/chosen": 0.6103604435920715, + "logits/rejected": 1.439798355102539, + "logps/chosen": -625.5350341796875, + "logps/rejected": -733.4219360351562, + "loss": 0.4369, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.8628792762756348, + "rewards/margins": 1.389877200126648, + "rewards/rejected": -4.252756595611572, + "step": 4630 + }, + { + "epoch": 0.61, + "grad_norm": 14.125, + "learning_rate": 2.004785675904982e-06, + "logits/chosen": 1.1228773593902588, + "logits/rejected": 1.6704814434051514, + "logps/chosen": -550.8262939453125, + "logps/rejected": -630.0404052734375, + "loss": 0.5813, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.9327898025512695, + "rewards/margins": 0.9519754648208618, + "rewards/rejected": -3.884765148162842, + "step": 4640 + }, + { + "epoch": 0.61, + "grad_norm": 8.25, + "learning_rate": 1.9935965348697624e-06, + "logits/chosen": 0.7240558862686157, + "logits/rejected": 0.9415324330329895, + "logps/chosen": -542.6046142578125, + "logps/rejected": -712.421142578125, + "loss": 0.3923, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.919755220413208, + "rewards/margins": 1.4323351383209229, + "rewards/rejected": -4.352089881896973, + "step": 4650 + }, + { + "epoch": 0.61, + "grad_norm": 28.0, + "learning_rate": 1.9824179619514807e-06, + "logits/chosen": 0.8619499206542969, + "logits/rejected": 1.7239103317260742, + "logps/chosen": -523.3920288085938, + "logps/rejected": -648.755126953125, + "loss": 0.4401, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.8888051509857178, + "rewards/margins": 1.20088791847229, + "rewards/rejected": -4.089693069458008, + "step": 4660 + }, + { + "epoch": 0.61, + "grad_norm": 17.125, + "learning_rate": 1.9712501904354004e-06, + "logits/chosen": 1.0945820808410645, + "logits/rejected": 1.327915072441101, + "logps/chosen": -570.0567016601562, + "logps/rejected": -677.2376708984375, + "loss": 0.5679, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.9967522621154785, + "rewards/margins": 0.9667608141899109, + "rewards/rejected": -3.963512420654297, + "step": 4670 + }, + { + "epoch": 0.61, + "grad_norm": 13.25, + "learning_rate": 1.960093453381369e-06, + "logits/chosen": 1.1339690685272217, + "logits/rejected": 2.105699062347412, + "logps/chosen": -542.7904663085938, + "logps/rejected": -632.7266845703125, + "loss": 0.4646, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0150303840637207, + "rewards/margins": 1.2327334880828857, + "rewards/rejected": -4.2477641105651855, + "step": 4680 + }, + { + "epoch": 0.61, + "grad_norm": 20.5, + "learning_rate": 1.948947983618962e-06, + "logits/chosen": 0.6128058433532715, + "logits/rejected": 1.3241350650787354, + "logps/chosen": -590.0193481445312, + "logps/rejected": -701.9865112304688, + "loss": 0.4458, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.8744754791259766, + "rewards/margins": 1.3885873556137085, + "rewards/rejected": -4.263062953948975, + "step": 4690 + }, + { + "epoch": 0.62, + "grad_norm": 9.9375, + "learning_rate": 1.937814013742611e-06, + "logits/chosen": 0.673147976398468, + "logits/rejected": 1.3856855630874634, + "logps/chosen": -479.01611328125, + "logps/rejected": -614.7681884765625, + "loss": 0.4437, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6545872688293457, + "rewards/margins": 1.4752554893493652, + "rewards/rejected": -4.129843235015869, + "step": 4700 + }, + { + "epoch": 0.62, + "eval_logits/chosen": 1.2695993185043335, + "eval_logits/rejected": 2.05474853515625, + "eval_logps/chosen": -545.9228515625, + "eval_logps/rejected": -653.1788330078125, + "eval_loss": 0.4940186142921448, + "eval_rewards/accuracies": 0.7444999814033508, + "eval_rewards/chosen": -2.813018321990967, + "eval_rewards/margins": 1.2729805707931519, + "eval_rewards/rejected": -4.085999488830566, + "eval_runtime": 1591.5189, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 4700 + }, + { + "epoch": 0.62, + "grad_norm": 15.375, + "learning_rate": 1.9266917761067617e-06, + "logits/chosen": 0.7281503677368164, + "logits/rejected": 1.7401161193847656, + "logps/chosen": -542.7756958007812, + "logps/rejected": -641.5377807617188, + "loss": 0.5409, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.9858946800231934, + "rewards/margins": 1.0609354972839355, + "rewards/rejected": -4.046829700469971, + "step": 4710 + }, + { + "epoch": 0.62, + "grad_norm": 13.125, + "learning_rate": 1.915581502821017e-06, + "logits/chosen": 0.4619746804237366, + "logits/rejected": 1.4797730445861816, + "logps/chosen": -514.7449340820312, + "logps/rejected": -664.6442260742188, + "loss": 0.4787, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.9499011039733887, + "rewards/margins": 1.4481924772262573, + "rewards/rejected": -4.398093223571777, + "step": 4720 + }, + { + "epoch": 0.62, + "grad_norm": 23.875, + "learning_rate": 1.9044834257452997e-06, + "logits/chosen": 0.49240007996559143, + "logits/rejected": 1.0757876634597778, + "logps/chosen": -588.3216552734375, + "logps/rejected": -722.0255126953125, + "loss": 0.6945, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.253741502761841, + "rewards/margins": 1.194266676902771, + "rewards/rejected": -4.448008060455322, + "step": 4730 + }, + { + "epoch": 0.62, + "grad_norm": 13.0, + "learning_rate": 1.893397776485006e-06, + "logits/chosen": 0.3050432801246643, + "logits/rejected": 1.475989580154419, + "logps/chosen": -574.0492553710938, + "logps/rejected": -703.4495849609375, + "loss": 0.4284, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.9173645973205566, + "rewards/margins": 1.7435029745101929, + "rewards/rejected": -4.660867691040039, + "step": 4740 + }, + { + "epoch": 0.62, + "grad_norm": 10.3125, + "learning_rate": 1.8823247863861804e-06, + "logits/chosen": 0.5984830856323242, + "logits/rejected": 1.160872459411621, + "logps/chosen": -553.18701171875, + "logps/rejected": -664.4539184570312, + "loss": 0.5087, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.7396388053894043, + "rewards/margins": 0.9977794885635376, + "rewards/rejected": -3.7374184131622314, + "step": 4750 + }, + { + "epoch": 0.62, + "grad_norm": 25.25, + "learning_rate": 1.8712646865306822e-06, + "logits/chosen": 0.5051913857460022, + "logits/rejected": 1.2967312335968018, + "logps/chosen": -573.8558959960938, + "logps/rejected": -653.6807861328125, + "loss": 0.5435, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.8586268424987793, + "rewards/margins": 1.1963424682617188, + "rewards/rejected": -4.05496883392334, + "step": 4760 + }, + { + "epoch": 0.62, + "grad_norm": 11.375, + "learning_rate": 1.8602177077313631e-06, + "logits/chosen": 0.6085312962532043, + "logits/rejected": 1.7123191356658936, + "logps/chosen": -513.794189453125, + "logps/rejected": -625.562255859375, + "loss": 0.4743, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6487550735473633, + "rewards/margins": 1.3480517864227295, + "rewards/rejected": -3.996807098388672, + "step": 4770 + }, + { + "epoch": 0.63, + "grad_norm": 10.1875, + "learning_rate": 1.8491840805272546e-06, + "logits/chosen": 0.7332175374031067, + "logits/rejected": 1.728525161743164, + "logps/chosen": -543.1729736328125, + "logps/rejected": -671.0867919921875, + "loss": 0.5058, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.8513693809509277, + "rewards/margins": 1.2838191986083984, + "rewards/rejected": -4.135189056396484, + "step": 4780 + }, + { + "epoch": 0.63, + "grad_norm": 20.875, + "learning_rate": 1.8381640351787516e-06, + "logits/chosen": 0.6986968517303467, + "logits/rejected": 1.7445147037506104, + "logps/chosen": -590.0587768554688, + "logps/rejected": -642.3038330078125, + "loss": 0.5453, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.819589138031006, + "rewards/margins": 0.9962056279182434, + "rewards/rejected": -3.8157947063446045, + "step": 4790 + }, + { + "epoch": 0.63, + "grad_norm": 32.0, + "learning_rate": 1.8271578016628122e-06, + "logits/chosen": 1.1015398502349854, + "logits/rejected": 1.974747657775879, + "logps/chosen": -560.9954223632812, + "logps/rejected": -675.0828247070312, + "loss": 0.4492, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9392876625061035, + "rewards/margins": 1.3561334609985352, + "rewards/rejected": -4.2954206466674805, + "step": 4800 + }, + { + "epoch": 0.63, + "eval_logits/chosen": 1.535466194152832, + "eval_logits/rejected": 2.339323043823242, + "eval_logps/chosen": -541.89599609375, + "eval_logps/rejected": -651.1524047851562, + "eval_loss": 0.4963241219520569, + "eval_rewards/accuracies": 0.7465000152587891, + "eval_rewards/chosen": -2.772749900817871, + "eval_rewards/margins": 1.292984962463379, + "eval_rewards/rejected": -4.065734386444092, + "eval_runtime": 1592.0484, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 4800 + }, + { + "epoch": 0.63, + "grad_norm": 16.625, + "learning_rate": 1.8161656096681546e-06, + "logits/chosen": 0.5773103833198547, + "logits/rejected": 1.7860008478164673, + "logps/chosen": -491.50469970703125, + "logps/rejected": -664.9474487304688, + "loss": 0.4972, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8516623973846436, + "rewards/margins": 1.4962561130523682, + "rewards/rejected": -4.347918510437012, + "step": 4810 + }, + { + "epoch": 0.63, + "grad_norm": 13.375, + "learning_rate": 1.8051876885904645e-06, + "logits/chosen": 0.4815793037414551, + "logits/rejected": 1.3946386575698853, + "logps/chosen": -519.1699829101562, + "logps/rejected": -677.970703125, + "loss": 0.3626, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.5333261489868164, + "rewards/margins": 1.677099585533142, + "rewards/rejected": -4.21042537689209, + "step": 4820 + }, + { + "epoch": 0.63, + "grad_norm": 14.375, + "learning_rate": 1.7942242675276098e-06, + "logits/chosen": 1.3374228477478027, + "logits/rejected": 1.18678879737854, + "logps/chosen": -529.24560546875, + "logps/rejected": -640.7904052734375, + "loss": 0.5131, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.8369922637939453, + "rewards/margins": 1.2228530645370483, + "rewards/rejected": -4.059844970703125, + "step": 4830 + }, + { + "epoch": 0.63, + "grad_norm": 9.9375, + "learning_rate": 1.783275575274856e-06, + "logits/chosen": 0.8641239404678345, + "logits/rejected": 1.1198005676269531, + "logps/chosen": -567.7559814453125, + "logps/rejected": -662.1914672851562, + "loss": 0.5847, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.185612440109253, + "rewards/margins": 1.0657835006713867, + "rewards/rejected": -4.251395225524902, + "step": 4840 + }, + { + "epoch": 0.63, + "grad_norm": 20.625, + "learning_rate": 1.7723418403200943e-06, + "logits/chosen": 0.7207244038581848, + "logits/rejected": 1.9345829486846924, + "logps/chosen": -624.4559936523438, + "logps/rejected": -741.6597290039062, + "loss": 0.4056, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.3388404846191406, + "rewards/margins": 1.618586778640747, + "rewards/rejected": -4.95742654800415, + "step": 4850 + }, + { + "epoch": 0.64, + "grad_norm": 15.8125, + "learning_rate": 1.7614232908390748e-06, + "logits/chosen": 0.21066589653491974, + "logits/rejected": 1.2674895524978638, + "logps/chosen": -590.947998046875, + "logps/rejected": -709.72998046875, + "loss": 0.5172, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.2923660278320312, + "rewards/margins": 1.3251597881317139, + "rewards/rejected": -4.617525577545166, + "step": 4860 + }, + { + "epoch": 0.64, + "grad_norm": 17.625, + "learning_rate": 1.7505201546906398e-06, + "logits/chosen": 0.9584872126579285, + "logits/rejected": 2.0030558109283447, + "logps/chosen": -533.1852416992188, + "logps/rejected": -680.3133544921875, + "loss": 0.3596, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.0049993991851807, + "rewards/margins": 1.8188884258270264, + "rewards/rejected": -4.823887825012207, + "step": 4870 + }, + { + "epoch": 0.64, + "grad_norm": 8.875, + "learning_rate": 1.7396326594119717e-06, + "logits/chosen": 0.8650108575820923, + "logits/rejected": 1.0653468370437622, + "logps/chosen": -557.3784790039062, + "logps/rejected": -722.6024169921875, + "loss": 0.405, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.025503396987915, + "rewards/margins": 1.6017059087753296, + "rewards/rejected": -4.627209663391113, + "step": 4880 + }, + { + "epoch": 0.64, + "grad_norm": 6.53125, + "learning_rate": 1.7287610322138449e-06, + "logits/chosen": 0.7085530161857605, + "logits/rejected": 1.709802269935608, + "logps/chosen": -580.3453979492188, + "logps/rejected": -651.3648681640625, + "loss": 0.55, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.1891696453094482, + "rewards/margins": 1.127560019493103, + "rewards/rejected": -4.316729545593262, + "step": 4890 + }, + { + "epoch": 0.64, + "grad_norm": 26.125, + "learning_rate": 1.7179054999758817e-06, + "logits/chosen": 0.18700245022773743, + "logits/rejected": 1.1381707191467285, + "logps/chosen": -652.1790161132812, + "logps/rejected": -758.5651245117188, + "loss": 0.5163, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.5247459411621094, + "rewards/margins": 1.343051552772522, + "rewards/rejected": -4.867797374725342, + "step": 4900 + }, + { + "epoch": 0.64, + "eval_logits/chosen": 1.2216382026672363, + "eval_logits/rejected": 2.0201332569122314, + "eval_logps/chosen": -599.6018676757812, + "eval_logps/rejected": -721.0643310546875, + "eval_loss": 0.501664936542511, + "eval_rewards/accuracies": 0.7465000152587891, + "eval_rewards/chosen": -3.3498075008392334, + "eval_rewards/margins": 1.4150458574295044, + "eval_rewards/rejected": -4.7648539543151855, + "eval_runtime": 1591.3817, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 4900 + }, + { + "epoch": 0.64, + "grad_norm": 12.4375, + "learning_rate": 1.7070662892418225e-06, + "logits/chosen": 0.6338080167770386, + "logits/rejected": 2.215214490890503, + "logps/chosen": -604.2860717773438, + "logps/rejected": -725.52392578125, + "loss": 0.4728, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.482276201248169, + "rewards/margins": 1.4508180618286133, + "rewards/rejected": -4.933094501495361, + "step": 4910 + }, + { + "epoch": 0.64, + "grad_norm": 23.0, + "learning_rate": 1.6962436262147913e-06, + "logits/chosen": 0.46580928564071655, + "logits/rejected": 1.9383723735809326, + "logps/chosen": -662.3033447265625, + "logps/rejected": -741.284912109375, + "loss": 0.5509, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.588061571121216, + "rewards/margins": 1.350954294204712, + "rewards/rejected": -4.939015865325928, + "step": 4920 + }, + { + "epoch": 0.65, + "grad_norm": 38.75, + "learning_rate": 1.6854377367525814e-06, + "logits/chosen": 0.31709781289100647, + "logits/rejected": 1.1077455282211304, + "logps/chosen": -584.2669067382812, + "logps/rejected": -666.3541870117188, + "loss": 0.5443, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.418966293334961, + "rewards/margins": 1.1538652181625366, + "rewards/rejected": -4.572831153869629, + "step": 4930 + }, + { + "epoch": 0.65, + "grad_norm": 9.0625, + "learning_rate": 1.6746488463629362e-06, + "logits/chosen": 0.04629017040133476, + "logits/rejected": 1.4084089994430542, + "logps/chosen": -565.3751220703125, + "logps/rejected": -666.2435302734375, + "loss": 0.4557, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.0415847301483154, + "rewards/margins": 1.346897840499878, + "rewards/rejected": -4.388482570648193, + "step": 4940 + }, + { + "epoch": 0.65, + "grad_norm": 13.4375, + "learning_rate": 1.6638771801988483e-06, + "logits/chosen": 0.6879199147224426, + "logits/rejected": 1.345462441444397, + "logps/chosen": -615.6664428710938, + "logps/rejected": -784.7410278320312, + "loss": 0.4795, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.4892544746398926, + "rewards/margins": 1.5444860458374023, + "rewards/rejected": -5.033740043640137, + "step": 4950 + }, + { + "epoch": 0.65, + "grad_norm": 24.625, + "learning_rate": 1.653122963053857e-06, + "logits/chosen": 0.6268502473831177, + "logits/rejected": 1.3769195079803467, + "logps/chosen": -597.7298583984375, + "logps/rejected": -719.7408447265625, + "loss": 0.3973, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.3582911491394043, + "rewards/margins": 1.5615208148956299, + "rewards/rejected": -4.919812202453613, + "step": 4960 + }, + { + "epoch": 0.65, + "grad_norm": 8.125, + "learning_rate": 1.6423864193573606e-06, + "logits/chosen": 0.6681777834892273, + "logits/rejected": 0.9838783144950867, + "logps/chosen": -571.2242431640625, + "logps/rejected": -712.8031005859375, + "loss": 0.496, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.1494784355163574, + "rewards/margins": 1.4304306507110596, + "rewards/rejected": -4.579909324645996, + "step": 4970 + }, + { + "epoch": 0.65, + "grad_norm": 13.375, + "learning_rate": 1.6316677731699286e-06, + "logits/chosen": 0.48633041977882385, + "logits/rejected": 2.1906113624572754, + "logps/chosen": -566.6742553710938, + "logps/rejected": -676.7261962890625, + "loss": 0.6053, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.2935385704040527, + "rewards/margins": 1.423719048500061, + "rewards/rejected": -4.717257499694824, + "step": 4980 + }, + { + "epoch": 0.65, + "grad_norm": 26.75, + "learning_rate": 1.6209672481786302e-06, + "logits/chosen": 0.36567243933677673, + "logits/rejected": 1.2046464681625366, + "logps/chosen": -582.448486328125, + "logps/rejected": -712.8392944335938, + "loss": 0.4767, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.0652308464050293, + "rewards/margins": 1.549925446510315, + "rewards/rejected": -4.615156173706055, + "step": 4990 + }, + { + "epoch": 0.65, + "grad_norm": 10.375, + "learning_rate": 1.6102850676923616e-06, + "logits/chosen": 0.5834435820579529, + "logits/rejected": 1.3819458484649658, + "logps/chosen": -551.6906127929688, + "logps/rejected": -691.9273681640625, + "loss": 0.488, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.029064655303955, + "rewards/margins": 1.308930516242981, + "rewards/rejected": -4.3379950523376465, + "step": 5000 + }, + { + "epoch": 0.65, + "eval_logits/chosen": 1.141782522201538, + "eval_logits/rejected": 1.9166334867477417, + "eval_logps/chosen": -589.7007446289062, + "eval_logps/rejected": -700.8106689453125, + "eval_loss": 0.4917171597480774, + "eval_rewards/accuracies": 0.7480000257492065, + "eval_rewards/chosen": -3.2507972717285156, + "eval_rewards/margins": 1.3115205764770508, + "eval_rewards/rejected": -4.562317848205566, + "eval_runtime": 1591.2907, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 5000 + }, + { + "epoch": 0.66, + "grad_norm": 17.0, + "learning_rate": 1.5996214546371888e-06, + "logits/chosen": 0.2420971840620041, + "logits/rejected": 1.3805590867996216, + "logps/chosen": -615.687744140625, + "logps/rejected": -716.4863891601562, + "loss": 0.531, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.3506503105163574, + "rewards/margins": 1.3097456693649292, + "rewards/rejected": -4.660396099090576, + "step": 5010 + }, + { + "epoch": 0.66, + "grad_norm": 13.875, + "learning_rate": 1.588976631551697e-06, + "logits/chosen": 0.24625544250011444, + "logits/rejected": 0.989874005317688, + "logps/chosen": -597.4532470703125, + "logps/rejected": -674.4581298828125, + "loss": 0.5112, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.2421135902404785, + "rewards/margins": 1.1127220392227173, + "rewards/rejected": -4.354835510253906, + "step": 5020 + }, + { + "epoch": 0.66, + "grad_norm": 10.8125, + "learning_rate": 1.5783508205823412e-06, + "logits/chosen": 0.15622951090335846, + "logits/rejected": 1.6825730800628662, + "logps/chosen": -613.8956298828125, + "logps/rejected": -752.7511596679688, + "loss": 0.4556, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.3116347789764404, + "rewards/margins": 1.4797947406768799, + "rewards/rejected": -4.7914299964904785, + "step": 5030 + }, + { + "epoch": 0.66, + "grad_norm": 13.1875, + "learning_rate": 1.5677442434788143e-06, + "logits/chosen": 0.4642793536186218, + "logits/rejected": 0.683434247970581, + "logps/chosen": -559.9010620117188, + "logps/rejected": -681.8499755859375, + "loss": 0.4602, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.9780526161193848, + "rewards/margins": 1.4143049716949463, + "rewards/rejected": -4.392356872558594, + "step": 5040 + }, + { + "epoch": 0.66, + "grad_norm": 5.4375, + "learning_rate": 1.5571571215894181e-06, + "logits/chosen": -0.12815335392951965, + "logits/rejected": 1.1323789358139038, + "logps/chosen": -601.626953125, + "logps/rejected": -710.6660766601562, + "loss": 0.4372, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0496914386749268, + "rewards/margins": 1.4026107788085938, + "rewards/rejected": -4.452301979064941, + "step": 5050 + }, + { + "epoch": 0.66, + "grad_norm": 20.625, + "learning_rate": 1.5465896758564452e-06, + "logits/chosen": 0.6499842405319214, + "logits/rejected": 1.215057134628296, + "logps/chosen": -544.9089965820312, + "logps/rejected": -654.0910034179688, + "loss": 0.5112, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.162558078765869, + "rewards/margins": 1.325282335281372, + "rewards/rejected": -4.487840175628662, + "step": 5060 + }, + { + "epoch": 0.66, + "grad_norm": 15.5, + "learning_rate": 1.5360421268115653e-06, + "logits/chosen": 0.1901804506778717, + "logits/rejected": 0.6268029808998108, + "logps/chosen": -530.2285766601562, + "logps/rejected": -594.3751220703125, + "loss": 0.4743, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7991414070129395, + "rewards/margins": 1.081934928894043, + "rewards/rejected": -3.8810763359069824, + "step": 5070 + }, + { + "epoch": 0.66, + "grad_norm": 9.1875, + "learning_rate": 1.5255146945712267e-06, + "logits/chosen": 0.4935234487056732, + "logits/rejected": 0.8733431100845337, + "logps/chosen": -549.5220947265625, + "logps/rejected": -681.1325073242188, + "loss": 0.4263, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.0330758094787598, + "rewards/margins": 1.3714685440063477, + "rewards/rejected": -4.404544353485107, + "step": 5080 + }, + { + "epoch": 0.67, + "grad_norm": 30.75, + "learning_rate": 1.5150075988320594e-06, + "logits/chosen": 0.6490196585655212, + "logits/rejected": 1.8105666637420654, + "logps/chosen": -576.24755859375, + "logps/rejected": -666.9951171875, + "loss": 0.5292, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.25989031791687, + "rewards/margins": 1.3189462423324585, + "rewards/rejected": -4.578836441040039, + "step": 5090 + }, + { + "epoch": 0.67, + "grad_norm": 6.6875, + "learning_rate": 1.5045210588662929e-06, + "logits/chosen": 0.13218924403190613, + "logits/rejected": 1.4612019062042236, + "logps/chosen": -584.4468994140625, + "logps/rejected": -697.099609375, + "loss": 0.3606, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9089255332946777, + "rewards/margins": 1.743561029434204, + "rewards/rejected": -4.652486324310303, + "step": 5100 + }, + { + "epoch": 0.67, + "eval_logits/chosen": 0.7813382744789124, + "eval_logits/rejected": 1.5031211376190186, + "eval_logps/chosen": -562.187744140625, + "eval_logps/rejected": -667.6594848632812, + "eval_loss": 0.49050119519233704, + "eval_rewards/accuracies": 0.7459999918937683, + "eval_rewards/chosen": -2.9756669998168945, + "eval_rewards/margins": 1.2551382780075073, + "eval_rewards/rejected": -4.23080587387085, + "eval_runtime": 1591.2135, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 5100 + }, + { + "epoch": 0.67, + "grad_norm": 19.875, + "learning_rate": 1.4940552935171781e-06, + "logits/chosen": 0.20014624297618866, + "logits/rejected": 0.6813384294509888, + "logps/chosen": -586.2484130859375, + "logps/rejected": -698.6951904296875, + "loss": 0.5211, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.9433157444000244, + "rewards/margins": 1.3401107788085938, + "rewards/rejected": -4.283426761627197, + "step": 5110 + }, + { + "epoch": 0.67, + "grad_norm": 15.4375, + "learning_rate": 1.483610521194419e-06, + "logits/chosen": -0.20831915736198425, + "logits/rejected": 0.7309791445732117, + "logps/chosen": -551.2055053710938, + "logps/rejected": -652.7286376953125, + "loss": 0.5044, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.8443474769592285, + "rewards/margins": 1.2581770420074463, + "rewards/rejected": -4.102524757385254, + "step": 5120 + }, + { + "epoch": 0.67, + "grad_norm": 16.25, + "learning_rate": 1.4731869598696226e-06, + "logits/chosen": -0.0020657808054238558, + "logits/rejected": 0.6582788228988647, + "logps/chosen": -581.8043823242188, + "logps/rejected": -651.2916870117188, + "loss": 0.579, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.0622143745422363, + "rewards/margins": 1.0450094938278198, + "rewards/rejected": -4.107223987579346, + "step": 5130 + }, + { + "epoch": 0.67, + "grad_norm": 13.4375, + "learning_rate": 1.4627848270717387e-06, + "logits/chosen": -0.0938434973359108, + "logits/rejected": 0.987653911113739, + "logps/chosen": -549.9574584960938, + "logps/rejected": -625.3163452148438, + "loss": 0.5751, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.074310302734375, + "rewards/margins": 1.1363435983657837, + "rewards/rejected": -4.210653305053711, + "step": 5140 + }, + { + "epoch": 0.67, + "grad_norm": 8.3125, + "learning_rate": 1.4524043398825277e-06, + "logits/chosen": -0.10383953154087067, + "logits/rejected": 0.6164258718490601, + "logps/chosen": -589.60107421875, + "logps/rejected": -697.7528076171875, + "loss": 0.4383, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.9964165687561035, + "rewards/margins": 1.383199691772461, + "rewards/rejected": -4.3796162605285645, + "step": 5150 + }, + { + "epoch": 0.68, + "grad_norm": 23.0, + "learning_rate": 1.4420457149320299e-06, + "logits/chosen": -0.2617081105709076, + "logits/rejected": 0.8577947616577148, + "logps/chosen": -570.8038330078125, + "logps/rejected": -598.7860717773438, + "loss": 0.5498, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.934159278869629, + "rewards/margins": 1.0436315536499023, + "rewards/rejected": -3.977790355682373, + "step": 5160 + }, + { + "epoch": 0.68, + "grad_norm": 6.6875, + "learning_rate": 1.431709168394042e-06, + "logits/chosen": 0.26271653175354004, + "logits/rejected": 0.6604413390159607, + "logps/chosen": -487.736328125, + "logps/rejected": -591.9219360351562, + "loss": 0.5466, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.592200517654419, + "rewards/margins": 1.1626756191253662, + "rewards/rejected": -3.7548763751983643, + "step": 5170 + }, + { + "epoch": 0.68, + "grad_norm": 18.0, + "learning_rate": 1.4213949159816059e-06, + "logits/chosen": -0.018306344747543335, + "logits/rejected": 1.2069590091705322, + "logps/chosen": -524.5364990234375, + "logps/rejected": -616.7649536132812, + "loss": 0.4177, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.609614372253418, + "rewards/margins": 1.407111406326294, + "rewards/rejected": -4.016725540161133, + "step": 5180 + }, + { + "epoch": 0.68, + "grad_norm": 14.375, + "learning_rate": 1.4111031729425103e-06, + "logits/chosen": 0.18678632378578186, + "logits/rejected": 0.5336076617240906, + "logps/chosen": -523.8223876953125, + "logps/rejected": -633.44384765625, + "loss": 0.4914, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.944256067276001, + "rewards/margins": 1.1703563928604126, + "rewards/rejected": -4.114611625671387, + "step": 5190 + }, + { + "epoch": 0.68, + "grad_norm": 8.4375, + "learning_rate": 1.4008341540547965e-06, + "logits/chosen": 0.04942316561937332, + "logits/rejected": 0.69202721118927, + "logps/chosen": -545.2200927734375, + "logps/rejected": -652.6435546875, + "loss": 0.58, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.91707181930542, + "rewards/margins": 1.0721800327301025, + "rewards/rejected": -3.9892516136169434, + "step": 5200 + }, + { + "epoch": 0.68, + "eval_logits/chosen": 0.5849885940551758, + "eval_logits/rejected": 1.2839491367340088, + "eval_logps/chosen": -552.4491577148438, + "eval_logps/rejected": -654.7923583984375, + "eval_loss": 0.4896867573261261, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -2.878281593322754, + "eval_rewards/margins": 1.2238528728485107, + "eval_rewards/rejected": -4.1021342277526855, + "eval_runtime": 1591.3233, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 5200 + }, + { + "epoch": 0.68, + "grad_norm": 15.4375, + "learning_rate": 1.3905880736222737e-06, + "logits/chosen": 0.21044036746025085, + "logits/rejected": 0.3255331516265869, + "logps/chosen": -511.98419189453125, + "logps/rejected": -632.8362426757812, + "loss": 0.496, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.792902946472168, + "rewards/margins": 1.1052320003509521, + "rewards/rejected": -3.89813494682312, + "step": 5210 + }, + { + "epoch": 0.68, + "grad_norm": 18.0, + "learning_rate": 1.3803651454700531e-06, + "logits/chosen": -0.14034347236156464, + "logits/rejected": 0.3313951790332794, + "logps/chosen": -500.55133056640625, + "logps/rejected": -620.788818359375, + "loss": 0.4916, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6808226108551025, + "rewards/margins": 1.2449661493301392, + "rewards/rejected": -3.9257888793945312, + "step": 5220 + }, + { + "epoch": 0.68, + "grad_norm": 12.375, + "learning_rate": 1.3701655829400773e-06, + "logits/chosen": -0.19148483872413635, + "logits/rejected": 0.5032289624214172, + "logps/chosen": -535.7360229492188, + "logps/rejected": -652.48583984375, + "loss": 0.5032, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7009353637695312, + "rewards/margins": 1.0817861557006836, + "rewards/rejected": -3.782721757888794, + "step": 5230 + }, + { + "epoch": 0.69, + "grad_norm": 14.0, + "learning_rate": 1.3599895988866756e-06, + "logits/chosen": -0.21712355315685272, + "logits/rejected": 0.2153054028749466, + "logps/chosen": -520.4691772460938, + "logps/rejected": -643.4196166992188, + "loss": 0.4404, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.611964702606201, + "rewards/margins": 1.2944133281707764, + "rewards/rejected": -3.9063782691955566, + "step": 5240 + }, + { + "epoch": 0.69, + "grad_norm": 14.25, + "learning_rate": 1.3498374056721198e-06, + "logits/chosen": 0.18565845489501953, + "logits/rejected": 0.5457652807235718, + "logps/chosen": -583.9906616210938, + "logps/rejected": -677.7162475585938, + "loss": 0.5224, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0645644664764404, + "rewards/margins": 1.1610159873962402, + "rewards/rejected": -4.225581169128418, + "step": 5250 + }, + { + "epoch": 0.69, + "grad_norm": 16.625, + "learning_rate": 1.3397092151621883e-06, + "logits/chosen": -0.12094113975763321, + "logits/rejected": 0.7155014872550964, + "logps/chosen": -605.60986328125, + "logps/rejected": -717.340576171875, + "loss": 0.5468, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.979651927947998, + "rewards/margins": 1.100801944732666, + "rewards/rejected": -4.080453872680664, + "step": 5260 + }, + { + "epoch": 0.69, + "grad_norm": 15.0625, + "learning_rate": 1.3296052387217484e-06, + "logits/chosen": 0.2536582350730896, + "logits/rejected": 0.5665210485458374, + "logps/chosen": -551.7531127929688, + "logps/rejected": -638.0416259765625, + "loss": 0.5232, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.853527069091797, + "rewards/margins": 1.0977308750152588, + "rewards/rejected": -3.9512581825256348, + "step": 5270 + }, + { + "epoch": 0.69, + "grad_norm": 20.5, + "learning_rate": 1.3195256872103476e-06, + "logits/chosen": 0.19485989212989807, + "logits/rejected": 0.4308921694755554, + "logps/chosen": -575.5075073242188, + "logps/rejected": -699.4771728515625, + "loss": 0.4572, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.144331932067871, + "rewards/margins": 1.1805914640426636, + "rewards/rejected": -4.324923515319824, + "step": 5280 + }, + { + "epoch": 0.69, + "grad_norm": 8.25, + "learning_rate": 1.3094707709778068e-06, + "logits/chosen": 0.03303980454802513, + "logits/rejected": 1.1130796670913696, + "logps/chosen": -588.6350708007812, + "logps/rejected": -650.6473388671875, + "loss": 0.5473, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.109278440475464, + "rewards/margins": 1.1929148435592651, + "rewards/rejected": -4.302193641662598, + "step": 5290 + }, + { + "epoch": 0.69, + "grad_norm": 21.5, + "learning_rate": 1.2994406998598364e-06, + "logits/chosen": 0.03543071821331978, + "logits/rejected": 0.4372057020664215, + "logps/chosen": -552.617919921875, + "logps/rejected": -666.7288208007812, + "loss": 0.5788, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.1095051765441895, + "rewards/margins": 1.2071665525436401, + "rewards/rejected": -4.316671848297119, + "step": 5300 + }, + { + "epoch": 0.69, + "eval_logits/chosen": 0.7114199995994568, + "eval_logits/rejected": 1.4058908224105835, + "eval_logps/chosen": -570.6942749023438, + "eval_logps/rejected": -672.7390747070312, + "eval_loss": 0.49004074931144714, + "eval_rewards/accuracies": 0.7490000128746033, + "eval_rewards/chosen": -3.060732126235962, + "eval_rewards/margins": 1.2208691835403442, + "eval_rewards/rejected": -4.281601428985596, + "eval_runtime": 1592.4706, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 5300 + }, + { + "epoch": 0.69, + "grad_norm": 7.96875, + "learning_rate": 1.2894356831736558e-06, + "logits/chosen": 0.30859580636024475, + "logits/rejected": 1.3365111351013184, + "logps/chosen": -567.5665283203125, + "logps/rejected": -685.6886596679688, + "loss": 0.5104, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.1270010471343994, + "rewards/margins": 1.6088695526123047, + "rewards/rejected": -4.735870838165283, + "step": 5310 + }, + { + "epoch": 0.7, + "grad_norm": 11.125, + "learning_rate": 1.2794559297136203e-06, + "logits/chosen": 0.17694668471813202, + "logits/rejected": 0.6812500953674316, + "logps/chosen": -587.8646850585938, + "logps/rejected": -652.458251953125, + "loss": 0.546, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.1099853515625, + "rewards/margins": 1.0107700824737549, + "rewards/rejected": -4.120755195617676, + "step": 5320 + }, + { + "epoch": 0.7, + "grad_norm": 21.0, + "learning_rate": 1.2695016477468724e-06, + "logits/chosen": 0.2500815987586975, + "logits/rejected": 0.7284664511680603, + "logps/chosen": -609.7841796875, + "logps/rejected": -668.8241577148438, + "loss": 0.5396, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.290778398513794, + "rewards/margins": 0.8813145756721497, + "rewards/rejected": -4.172092914581299, + "step": 5330 + }, + { + "epoch": 0.7, + "grad_norm": 12.25, + "learning_rate": 1.2595730450089874e-06, + "logits/chosen": 0.20059093832969666, + "logits/rejected": 0.6966744065284729, + "logps/chosen": -577.68603515625, + "logps/rejected": -682.2274169921875, + "loss": 0.4409, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.045767068862915, + "rewards/margins": 1.3784563541412354, + "rewards/rejected": -4.42422342300415, + "step": 5340 + }, + { + "epoch": 0.7, + "grad_norm": 7.4375, + "learning_rate": 1.2496703286996433e-06, + "logits/chosen": 0.23528914153575897, + "logits/rejected": 0.8474369049072266, + "logps/chosen": -563.860107421875, + "logps/rejected": -701.6685180664062, + "loss": 0.3999, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1497082710266113, + "rewards/margins": 1.4197697639465332, + "rewards/rejected": -4.5694780349731445, + "step": 5350 + }, + { + "epoch": 0.7, + "grad_norm": 22.0, + "learning_rate": 1.2397937054782961e-06, + "logits/chosen": 0.3095719516277313, + "logits/rejected": 1.2036718130111694, + "logps/chosen": -568.3956298828125, + "logps/rejected": -663.9815063476562, + "loss": 0.504, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.285660982131958, + "rewards/margins": 1.2506592273712158, + "rewards/rejected": -4.536319732666016, + "step": 5360 + }, + { + "epoch": 0.7, + "grad_norm": 12.8125, + "learning_rate": 1.2299433814598635e-06, + "logits/chosen": 0.8002223968505859, + "logits/rejected": 1.2919270992279053, + "logps/chosen": -540.2605590820312, + "logps/rejected": -667.7479858398438, + "loss": 0.54, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.3001084327697754, + "rewards/margins": 1.1389198303222656, + "rewards/rejected": -4.439028263092041, + "step": 5370 + }, + { + "epoch": 0.7, + "grad_norm": 19.0, + "learning_rate": 1.2201195622104265e-06, + "logits/chosen": 0.4608491063117981, + "logits/rejected": 1.0170303583145142, + "logps/chosen": -602.1793212890625, + "logps/rejected": -678.3692626953125, + "loss": 0.4949, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.355029582977295, + "rewards/margins": 1.0179849863052368, + "rewards/rejected": -4.3730149269104, + "step": 5380 + }, + { + "epoch": 0.71, + "grad_norm": 13.6875, + "learning_rate": 1.2103224527429417e-06, + "logits/chosen": 0.8049987554550171, + "logits/rejected": 0.7608388066291809, + "logps/chosen": -533.5645751953125, + "logps/rejected": -648.8416748046875, + "loss": 0.4595, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.1001079082489014, + "rewards/margins": 1.1306575536727905, + "rewards/rejected": -4.2307658195495605, + "step": 5390 + }, + { + "epoch": 0.71, + "grad_norm": 14.3125, + "learning_rate": 1.2005522575129559e-06, + "logits/chosen": 0.47020992636680603, + "logits/rejected": 0.8666495084762573, + "logps/chosen": -507.0211486816406, + "logps/rejected": -639.0309448242188, + "loss": 0.4138, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9997565746307373, + "rewards/margins": 1.4226839542388916, + "rewards/rejected": -4.422440528869629, + "step": 5400 + }, + { + "epoch": 0.71, + "eval_logits/chosen": 0.8969926238059998, + "eval_logits/rejected": 1.6120911836624146, + "eval_logps/chosen": -599.54638671875, + "eval_logps/rejected": -706.5120239257812, + "eval_loss": 0.4909508526325226, + "eval_rewards/accuracies": 0.7515000104904175, + "eval_rewards/chosen": -3.3492534160614014, + "eval_rewards/margins": 1.270076870918274, + "eval_rewards/rejected": -4.619329929351807, + "eval_runtime": 1592.4682, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 5400 + }, + { + "epoch": 0.71, + "grad_norm": 8.625, + "learning_rate": 1.1908091804143469e-06, + "logits/chosen": -0.17856049537658691, + "logits/rejected": 1.5493866205215454, + "logps/chosen": -641.3067626953125, + "logps/rejected": -742.3198852539062, + "loss": 0.3837, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.4065234661102295, + "rewards/margins": 1.6241868734359741, + "rewards/rejected": -5.030710697174072, + "step": 5410 + }, + { + "epoch": 0.71, + "grad_norm": 20.625, + "learning_rate": 1.1810934247750649e-06, + "logits/chosen": 0.0274839885532856, + "logits/rejected": 0.9218126535415649, + "logps/chosen": -621.7474365234375, + "logps/rejected": -747.618896484375, + "loss": 0.5001, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.40272855758667, + "rewards/margins": 1.4314203262329102, + "rewards/rejected": -4.834149360656738, + "step": 5420 + }, + { + "epoch": 0.71, + "grad_norm": 34.75, + "learning_rate": 1.1714051933528881e-06, + "logits/chosen": 0.3180490732192993, + "logits/rejected": 0.5688341856002808, + "logps/chosen": -591.6409912109375, + "logps/rejected": -710.7434692382812, + "loss": 0.5916, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.221553087234497, + "rewards/margins": 1.2600384950637817, + "rewards/rejected": -4.48159122467041, + "step": 5430 + }, + { + "epoch": 0.71, + "grad_norm": 17.75, + "learning_rate": 1.161744688331192e-06, + "logits/chosen": 0.5358083844184875, + "logits/rejected": 1.2289925813674927, + "logps/chosen": -574.173828125, + "logps/rejected": -725.67431640625, + "loss": 0.49, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.5042128562927246, + "rewards/margins": 1.3280826807022095, + "rewards/rejected": -4.8322954177856445, + "step": 5440 + }, + { + "epoch": 0.71, + "grad_norm": 38.5, + "learning_rate": 1.152112111314733e-06, + "logits/chosen": 0.5168737173080444, + "logits/rejected": 0.9716085195541382, + "logps/chosen": -652.9949951171875, + "logps/rejected": -748.0339965820312, + "loss": 0.6163, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.820329189300537, + "rewards/margins": 0.945980429649353, + "rewards/rejected": -4.76630973815918, + "step": 5450 + }, + { + "epoch": 0.71, + "grad_norm": 10.75, + "learning_rate": 1.142507663325439e-06, + "logits/chosen": 0.05302376672625542, + "logits/rejected": 0.6689623594284058, + "logps/chosen": -641.9430541992188, + "logps/rejected": -752.3848266601562, + "loss": 0.445, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.3499228954315186, + "rewards/margins": 1.3526192903518677, + "rewards/rejected": -4.702542781829834, + "step": 5460 + }, + { + "epoch": 0.72, + "grad_norm": 13.9375, + "learning_rate": 1.132931544798211e-06, + "logits/chosen": 0.10494127124547958, + "logits/rejected": 0.8696743249893188, + "logps/chosen": -627.115478515625, + "logps/rejected": -754.4387817382812, + "loss": 0.4932, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.5821831226348877, + "rewards/margins": 1.1890878677368164, + "rewards/rejected": -4.771270751953125, + "step": 5470 + }, + { + "epoch": 0.72, + "grad_norm": 21.5, + "learning_rate": 1.1233839555767482e-06, + "logits/chosen": -0.17560331523418427, + "logits/rejected": 1.348668098449707, + "logps/chosen": -647.2442626953125, + "logps/rejected": -714.8629760742188, + "loss": 0.5374, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.377307176589966, + "rewards/margins": 1.3940346240997314, + "rewards/rejected": -4.7713422775268555, + "step": 5480 + }, + { + "epoch": 0.72, + "grad_norm": 14.625, + "learning_rate": 1.1138650949093668e-06, + "logits/chosen": -0.04205578565597534, + "logits/rejected": 0.8225839734077454, + "logps/chosen": -592.0736083984375, + "logps/rejected": -687.6424560546875, + "loss": 0.5378, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.302886486053467, + "rewards/margins": 1.1337714195251465, + "rewards/rejected": -4.436657905578613, + "step": 5490 + }, + { + "epoch": 0.72, + "grad_norm": 38.25, + "learning_rate": 1.1043751614448543e-06, + "logits/chosen": -0.006318402476608753, + "logits/rejected": 0.9290812611579895, + "logps/chosen": -679.0123901367188, + "logps/rejected": -716.4671020507812, + "loss": 0.5737, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.7354023456573486, + "rewards/margins": 1.0157877206802368, + "rewards/rejected": -4.751189708709717, + "step": 5500 + }, + { + "epoch": 0.72, + "eval_logits/chosen": 0.6955203413963318, + "eval_logits/rejected": 1.4061022996902466, + "eval_logps/chosen": -583.0511474609375, + "eval_logps/rejected": -689.724853515625, + "eval_loss": 0.48981621861457825, + "eval_rewards/accuracies": 0.7480000257492065, + "eval_rewards/chosen": -3.1843008995056152, + "eval_rewards/margins": 1.2671582698822021, + "eval_rewards/rejected": -4.4514594078063965, + "eval_runtime": 1592.2368, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 5500 + }, + { + "epoch": 0.72, + "grad_norm": 16.125, + "learning_rate": 1.0949143532283107e-06, + "logits/chosen": -0.15896812081336975, + "logits/rejected": 0.8019599914550781, + "logps/chosen": -567.5126953125, + "logps/rejected": -690.685302734375, + "loss": 0.4008, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.882983446121216, + "rewards/margins": 1.5050808191299438, + "rewards/rejected": -4.388064384460449, + "step": 5510 + }, + { + "epoch": 0.72, + "grad_norm": 16.25, + "learning_rate": 1.0854828676970275e-06, + "logits/chosen": 0.022523891180753708, + "logits/rejected": 0.5711814761161804, + "logps/chosen": -562.1071166992188, + "logps/rejected": -637.1767578125, + "loss": 0.5859, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.2052435874938965, + "rewards/margins": 0.9110970497131348, + "rewards/rejected": -4.116340160369873, + "step": 5520 + }, + { + "epoch": 0.72, + "grad_norm": 14.0625, + "learning_rate": 1.076080901676361e-06, + "logits/chosen": -0.07785852253437042, + "logits/rejected": 1.264184594154358, + "logps/chosen": -583.8575439453125, + "logps/rejected": -688.28125, + "loss": 0.5176, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.1855790615081787, + "rewards/margins": 1.4519331455230713, + "rewards/rejected": -4.637512683868408, + "step": 5530 + }, + { + "epoch": 0.72, + "grad_norm": 13.125, + "learning_rate": 1.0667086513756234e-06, + "logits/chosen": 0.41335564851760864, + "logits/rejected": 1.4227181673049927, + "logps/chosen": -571.003173828125, + "logps/rejected": -673.028564453125, + "loss": 0.4494, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.126842737197876, + "rewards/margins": 1.4141932725906372, + "rewards/rejected": -4.5410356521606445, + "step": 5540 + }, + { + "epoch": 0.73, + "grad_norm": 20.875, + "learning_rate": 1.0573663123839912e-06, + "logits/chosen": 0.6815285682678223, + "logits/rejected": 1.2222025394439697, + "logps/chosen": -582.1532592773438, + "logps/rejected": -683.1765747070312, + "loss": 0.4605, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.343864917755127, + "rewards/margins": 1.2274417877197266, + "rewards/rejected": -4.5713067054748535, + "step": 5550 + }, + { + "epoch": 0.73, + "grad_norm": 11.25, + "learning_rate": 1.0480540796664251e-06, + "logits/chosen": 0.10144902765750885, + "logits/rejected": 0.8711962699890137, + "logps/chosen": -566.8845825195312, + "logps/rejected": -637.9187622070312, + "loss": 0.5358, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.1263809204101562, + "rewards/margins": 0.9405500292778015, + "rewards/rejected": -4.066930770874023, + "step": 5560 + }, + { + "epoch": 0.73, + "grad_norm": 14.8125, + "learning_rate": 1.0387721475595978e-06, + "logits/chosen": -0.22912903130054474, + "logits/rejected": 0.9436883926391602, + "logps/chosen": -584.5687255859375, + "logps/rejected": -707.6832275390625, + "loss": 0.4363, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.001406192779541, + "rewards/margins": 1.3768746852874756, + "rewards/rejected": -4.378281116485596, + "step": 5570 + }, + { + "epoch": 0.73, + "grad_norm": 14.25, + "learning_rate": 1.0295207097678378e-06, + "logits/chosen": 0.14958836138248444, + "logits/rejected": 1.2157787084579468, + "logps/chosen": -631.9813842773438, + "logps/rejected": -706.6419677734375, + "loss": 0.4429, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.2564949989318848, + "rewards/margins": 1.3660955429077148, + "rewards/rejected": -4.622590065002441, + "step": 5580 + }, + { + "epoch": 0.73, + "grad_norm": 11.875, + "learning_rate": 1.0202999593590924e-06, + "logits/chosen": 0.5636407732963562, + "logits/rejected": 1.1310765743255615, + "logps/chosen": -582.2732543945312, + "logps/rejected": -701.7555541992188, + "loss": 0.4697, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.436253309249878, + "rewards/margins": 1.2652666568756104, + "rewards/rejected": -4.7015204429626465, + "step": 5590 + }, + { + "epoch": 0.73, + "grad_norm": 20.875, + "learning_rate": 1.011110088760891e-06, + "logits/chosen": 0.11347509920597076, + "logits/rejected": 1.787253737449646, + "logps/chosen": -631.408203125, + "logps/rejected": -739.8702392578125, + "loss": 0.4249, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.302558183670044, + "rewards/margins": 1.6660197973251343, + "rewards/rejected": -4.968577861785889, + "step": 5600 + }, + { + "epoch": 0.73, + "eval_logits/chosen": 0.9557819366455078, + "eval_logits/rejected": 1.7110285758972168, + "eval_logps/chosen": -599.0980224609375, + "eval_logps/rejected": -712.3563842773438, + "eval_loss": 0.4918125569820404, + "eval_rewards/accuracies": 0.7490000128746033, + "eval_rewards/chosen": -3.3447697162628174, + "eval_rewards/margins": 1.3330047130584717, + "eval_rewards/rejected": -4.677773952484131, + "eval_runtime": 1592.4673, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 5600 + }, + { + "epoch": 0.73, + "grad_norm": 7.78125, + "learning_rate": 1.0019512897563347e-06, + "logits/chosen": 0.25357693433761597, + "logits/rejected": 1.0136306285858154, + "logps/chosen": -611.2633666992188, + "logps/rejected": -688.2657470703125, + "loss": 0.6093, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.4056782722473145, + "rewards/margins": 1.086458444595337, + "rewards/rejected": -4.492136001586914, + "step": 5610 + }, + { + "epoch": 0.74, + "grad_norm": 23.0, + "learning_rate": 9.928237534800935e-07, + "logits/chosen": 0.1514512449502945, + "logits/rejected": 1.30339777469635, + "logps/chosen": -624.1841430664062, + "logps/rejected": -706.040283203125, + "loss": 0.4836, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.171689748764038, + "rewards/margins": 1.3224647045135498, + "rewards/rejected": -4.494154453277588, + "step": 5620 + }, + { + "epoch": 0.74, + "grad_norm": 14.6875, + "learning_rate": 9.837276704144174e-07, + "logits/chosen": 0.6703234314918518, + "logits/rejected": 1.3387033939361572, + "logps/chosen": -538.6806640625, + "logps/rejected": -719.9829711914062, + "loss": 0.4899, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.253155469894409, + "rewards/margins": 1.5623925924301147, + "rewards/rejected": -4.815548419952393, + "step": 5630 + }, + { + "epoch": 0.74, + "grad_norm": 19.125, + "learning_rate": 9.746632303851569e-07, + "logits/chosen": 0.35827261209487915, + "logits/rejected": 1.1380624771118164, + "logps/chosen": -574.8941650390625, + "logps/rejected": -700.0415649414062, + "loss": 0.4769, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.280141830444336, + "rewards/margins": 1.4382925033569336, + "rewards/rejected": -4.7184343338012695, + "step": 5640 + }, + { + "epoch": 0.74, + "grad_norm": 19.625, + "learning_rate": 9.65630622557809e-07, + "logits/chosen": 0.5788121223449707, + "logits/rejected": 0.6934975385665894, + "logps/chosen": -583.9860229492188, + "logps/rejected": -676.6206665039062, + "loss": 0.5894, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.547583818435669, + "rewards/margins": 0.9842156171798706, + "rewards/rejected": -4.531800270080566, + "step": 5650 + }, + { + "epoch": 0.74, + "grad_norm": 16.75, + "learning_rate": 9.56630035433561e-07, + "logits/chosen": 0.36284205317497253, + "logits/rejected": 1.3190720081329346, + "logps/chosen": -594.0944213867188, + "logps/rejected": -695.3111572265625, + "loss": 0.5701, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.2505619525909424, + "rewards/margins": 1.1941272020339966, + "rewards/rejected": -4.4446892738342285, + "step": 5660 + }, + { + "epoch": 0.74, + "grad_norm": 13.0, + "learning_rate": 9.476616568453659e-07, + "logits/chosen": 0.47623515129089355, + "logits/rejected": 1.598987340927124, + "logps/chosen": -607.935546875, + "logps/rejected": -715.621826171875, + "loss": 0.5532, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.381014347076416, + "rewards/margins": 1.5085983276367188, + "rewards/rejected": -4.889612674713135, + "step": 5670 + }, + { + "epoch": 0.74, + "grad_norm": 33.0, + "learning_rate": 9.387256739540162e-07, + "logits/chosen": 0.35760414600372314, + "logits/rejected": 1.0268436670303345, + "logps/chosen": -598.7515869140625, + "logps/rejected": -728.8211669921875, + "loss": 0.5329, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.3422341346740723, + "rewards/margins": 1.1946098804473877, + "rewards/rejected": -4.536844253540039, + "step": 5680 + }, + { + "epoch": 0.74, + "grad_norm": 15.6875, + "learning_rate": 9.298222732442377e-07, + "logits/chosen": 0.9866177439689636, + "logits/rejected": 1.629563570022583, + "logps/chosen": -525.8026123046875, + "logps/rejected": -634.7586669921875, + "loss": 0.5319, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.24723482131958, + "rewards/margins": 1.033001184463501, + "rewards/rejected": -4.28023624420166, + "step": 5690 + }, + { + "epoch": 0.75, + "grad_norm": 15.8125, + "learning_rate": 9.20951640520803e-07, + "logits/chosen": 0.5580320954322815, + "logits/rejected": 1.2365334033966064, + "logps/chosen": -517.0693359375, + "logps/rejected": -630.359619140625, + "loss": 0.5457, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.20147705078125, + "rewards/margins": 1.087882161140442, + "rewards/rejected": -4.289359092712402, + "step": 5700 + }, + { + "epoch": 0.75, + "eval_logits/chosen": 0.9922085404396057, + "eval_logits/rejected": 1.7372267246246338, + "eval_logps/chosen": -592.4562377929688, + "eval_logps/rejected": -701.9876708984375, + "eval_loss": 0.4897419512271881, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -3.278352737426758, + "eval_rewards/margins": 1.295735239982605, + "eval_rewards/rejected": -4.574087619781494, + "eval_runtime": 1592.5991, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 5700 + }, + { + "epoch": 0.75, + "grad_norm": 17.75, + "learning_rate": 9.121139609046484e-07, + "logits/chosen": 0.1699611395597458, + "logits/rejected": 0.9091424942016602, + "logps/chosen": -586.0159301757812, + "logps/rejected": -743.6968383789062, + "loss": 0.5009, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.4288792610168457, + "rewards/margins": 1.5915720462799072, + "rewards/rejected": -5.020451545715332, + "step": 5710 + }, + { + "epoch": 0.75, + "grad_norm": 12.375, + "learning_rate": 9.033094188290121e-07, + "logits/chosen": 0.05250464752316475, + "logits/rejected": 0.9846289753913879, + "logps/chosen": -579.5111083984375, + "logps/rejected": -695.6873168945312, + "loss": 0.4425, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.0725390911102295, + "rewards/margins": 1.472003698348999, + "rewards/rejected": -4.5445427894592285, + "step": 5720 + }, + { + "epoch": 0.75, + "grad_norm": 19.25, + "learning_rate": 8.945381980355889e-07, + "logits/chosen": 0.48727503418922424, + "logits/rejected": 1.2392418384552002, + "logps/chosen": -584.3572998046875, + "logps/rejected": -646.3514404296875, + "loss": 0.5468, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.0813841819763184, + "rewards/margins": 1.3014500141143799, + "rewards/rejected": -4.382834434509277, + "step": 5730 + }, + { + "epoch": 0.75, + "grad_norm": 6.75, + "learning_rate": 8.858004815706919e-07, + "logits/chosen": 0.1299765259027481, + "logits/rejected": 1.4009642601013184, + "logps/chosen": -597.6690063476562, + "logps/rejected": -718.248046875, + "loss": 0.4349, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.3065361976623535, + "rewards/margins": 1.5114755630493164, + "rewards/rejected": -4.81801176071167, + "step": 5740 + }, + { + "epoch": 0.75, + "grad_norm": 7.53125, + "learning_rate": 8.77096451781432e-07, + "logits/chosen": 0.08577007800340652, + "logits/rejected": 1.0180623531341553, + "logps/chosen": -612.34912109375, + "logps/rejected": -712.0511474609375, + "loss": 0.4807, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2083351612091064, + "rewards/margins": 1.256860613822937, + "rewards/rejected": -4.465196132659912, + "step": 5750 + }, + { + "epoch": 0.75, + "grad_norm": 11.875, + "learning_rate": 8.684262903119165e-07, + "logits/chosen": 0.5749713778495789, + "logits/rejected": 1.654703140258789, + "logps/chosen": -603.7852172851562, + "logps/rejected": -712.9544677734375, + "loss": 0.3899, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.355541944503784, + "rewards/margins": 1.4487149715423584, + "rewards/rejected": -4.804256916046143, + "step": 5760 + }, + { + "epoch": 0.76, + "grad_norm": 13.0625, + "learning_rate": 8.597901780994525e-07, + "logits/chosen": 0.9888712167739868, + "logits/rejected": 1.534705400466919, + "logps/chosen": -561.2703857421875, + "logps/rejected": -702.0836791992188, + "loss": 0.455, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.1287243366241455, + "rewards/margins": 1.4464129209518433, + "rewards/rejected": -4.575137138366699, + "step": 5770 + }, + { + "epoch": 0.76, + "grad_norm": 7.375, + "learning_rate": 8.511882953707773e-07, + "logits/chosen": 0.46083322167396545, + "logits/rejected": 1.4080815315246582, + "logps/chosen": -536.21533203125, + "logps/rejected": -711.8817749023438, + "loss": 0.4216, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1101584434509277, + "rewards/margins": 1.7463359832763672, + "rewards/rejected": -4.856494426727295, + "step": 5780 + }, + { + "epoch": 0.76, + "grad_norm": 19.0, + "learning_rate": 8.426208216382944e-07, + "logits/chosen": 0.6455115079879761, + "logits/rejected": 1.258669137954712, + "logps/chosen": -593.31640625, + "logps/rejected": -674.8057861328125, + "loss": 0.469, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.150735378265381, + "rewards/margins": 1.3739995956420898, + "rewards/rejected": -4.524735450744629, + "step": 5790 + }, + { + "epoch": 0.76, + "grad_norm": 17.25, + "learning_rate": 8.340879356963245e-07, + "logits/chosen": 0.442685604095459, + "logits/rejected": 1.396315336227417, + "logps/chosen": -635.2542114257812, + "logps/rejected": -683.7772827148438, + "loss": 0.5287, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.358551025390625, + "rewards/margins": 1.11667799949646, + "rewards/rejected": -4.475229263305664, + "step": 5800 + }, + { + "epoch": 0.76, + "eval_logits/chosen": 1.2037140130996704, + "eval_logits/rejected": 1.9801844358444214, + "eval_logps/chosen": -596.2890014648438, + "eval_logps/rejected": -710.5777587890625, + "eval_loss": 0.4920007884502411, + "eval_rewards/accuracies": 0.7494999766349792, + "eval_rewards/chosen": -3.3166792392730713, + "eval_rewards/margins": 1.343308448791504, + "eval_rewards/rejected": -4.659987926483154, + "eval_runtime": 1591.929, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 5800 + }, + { + "epoch": 0.76, + "grad_norm": 26.375, + "learning_rate": 8.255898156173777e-07, + "logits/chosen": 0.31752336025238037, + "logits/rejected": 1.4020841121673584, + "logps/chosen": -596.1682739257812, + "logps/rejected": -698.3563232421875, + "loss": 0.585, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.314462184906006, + "rewards/margins": 1.2807135581970215, + "rewards/rejected": -4.5951762199401855, + "step": 5810 + }, + { + "epoch": 0.76, + "grad_norm": 12.9375, + "learning_rate": 8.171266387484389e-07, + "logits/chosen": 0.34116196632385254, + "logits/rejected": 1.6084121465682983, + "logps/chosen": -616.3291625976562, + "logps/rejected": -757.4080810546875, + "loss": 0.5309, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.6172256469726562, + "rewards/margins": 1.4249513149261475, + "rewards/rejected": -5.042177200317383, + "step": 5820 + }, + { + "epoch": 0.76, + "grad_norm": 9.25, + "learning_rate": 8.086985817072604e-07, + "logits/chosen": 0.22463031113147736, + "logits/rejected": 1.289733648300171, + "logps/chosen": -579.5349731445312, + "logps/rejected": -716.548095703125, + "loss": 0.5499, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.202744960784912, + "rewards/margins": 1.1577074527740479, + "rewards/rejected": -4.360452175140381, + "step": 5830 + }, + { + "epoch": 0.76, + "grad_norm": 9.75, + "learning_rate": 8.003058203786835e-07, + "logits/chosen": 0.5225099325180054, + "logits/rejected": 1.9025760889053345, + "logps/chosen": -554.4903564453125, + "logps/rejected": -697.1652221679688, + "loss": 0.3968, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.1905171871185303, + "rewards/margins": 1.6949659585952759, + "rewards/rejected": -4.885483264923096, + "step": 5840 + }, + { + "epoch": 0.77, + "grad_norm": 22.5, + "learning_rate": 7.91948529910963e-07, + "logits/chosen": 0.3707103729248047, + "logits/rejected": 1.3207480907440186, + "logps/chosen": -581.2924194335938, + "logps/rejected": -676.443603515625, + "loss": 0.4742, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.2187423706054688, + "rewards/margins": 1.3398678302764893, + "rewards/rejected": -4.558609962463379, + "step": 5850 + }, + { + "epoch": 0.77, + "grad_norm": 7.1875, + "learning_rate": 7.836268847121126e-07, + "logits/chosen": 0.5717436075210571, + "logits/rejected": 1.3999663591384888, + "logps/chosen": -559.6635131835938, + "logps/rejected": -694.9573974609375, + "loss": 0.4302, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.0068411827087402, + "rewards/margins": 1.5318539142608643, + "rewards/rejected": -4.538695335388184, + "step": 5860 + }, + { + "epoch": 0.77, + "grad_norm": 10.5, + "learning_rate": 7.753410584462681e-07, + "logits/chosen": 0.04864966869354248, + "logits/rejected": 1.1614755392074585, + "logps/chosen": -604.5387573242188, + "logps/rejected": -691.7279052734375, + "loss": 0.4828, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.178351879119873, + "rewards/margins": 1.2994771003723145, + "rewards/rejected": -4.477828502655029, + "step": 5870 + }, + { + "epoch": 0.77, + "grad_norm": 9.3125, + "learning_rate": 7.670912240300596e-07, + "logits/chosen": -0.06198643520474434, + "logits/rejected": 1.4561761617660522, + "logps/chosen": -610.7146606445312, + "logps/rejected": -719.8155517578125, + "loss": 0.3941, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.0767502784729004, + "rewards/margins": 1.5548076629638672, + "rewards/rejected": -4.631557464599609, + "step": 5880 + }, + { + "epoch": 0.77, + "grad_norm": 11.3125, + "learning_rate": 7.588775536290035e-07, + "logits/chosen": 0.4458466172218323, + "logits/rejected": 1.3329976797103882, + "logps/chosen": -572.161865234375, + "logps/rejected": -688.1071166992188, + "loss": 0.3865, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.103865146636963, + "rewards/margins": 1.4781702756881714, + "rewards/rejected": -4.582036018371582, + "step": 5890 + }, + { + "epoch": 0.77, + "grad_norm": 8.9375, + "learning_rate": 7.507002186539147e-07, + "logits/chosen": 0.36371809244155884, + "logits/rejected": 0.9052112698554993, + "logps/chosen": -596.0297241210938, + "logps/rejected": -702.3975830078125, + "loss": 0.5286, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.279501438140869, + "rewards/margins": 1.1486151218414307, + "rewards/rejected": -4.428115367889404, + "step": 5900 + }, + { + "epoch": 0.77, + "eval_logits/chosen": 1.13614821434021, + "eval_logits/rejected": 1.9038183689117432, + "eval_logps/chosen": -587.6722412109375, + "eval_logps/rejected": -701.1276245117188, + "eval_loss": 0.4918730854988098, + "eval_rewards/accuracies": 0.7465000152587891, + "eval_rewards/chosen": -3.2305116653442383, + "eval_rewards/margins": 1.3349748849868774, + "eval_rewards/rejected": -4.565486907958984, + "eval_runtime": 1591.0126, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 5900 + }, + { + "epoch": 0.77, + "grad_norm": 9.5, + "learning_rate": 7.425593897573216e-07, + "logits/chosen": 0.2484653890132904, + "logits/rejected": 1.1581590175628662, + "logps/chosen": -580.0093994140625, + "logps/rejected": -667.7286376953125, + "loss": 0.4072, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.056926727294922, + "rewards/margins": 1.5210171937942505, + "rewards/rejected": -4.577943325042725, + "step": 5910 + }, + { + "epoch": 0.77, + "grad_norm": 14.375, + "learning_rate": 7.344552368299088e-07, + "logits/chosen": 0.7004015445709229, + "logits/rejected": 1.145263433456421, + "logps/chosen": -525.1219482421875, + "logps/rejected": -703.9129638671875, + "loss": 0.432, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.113039493560791, + "rewards/margins": 1.5690702199935913, + "rewards/rejected": -4.682109832763672, + "step": 5920 + }, + { + "epoch": 0.78, + "grad_norm": 21.0, + "learning_rate": 7.26387928996973e-07, + "logits/chosen": -0.07604257762432098, + "logits/rejected": 1.174023985862732, + "logps/chosen": -568.1380615234375, + "logps/rejected": -627.6046142578125, + "loss": 0.5232, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.1161561012268066, + "rewards/margins": 1.02237868309021, + "rewards/rejected": -4.1385345458984375, + "step": 5930 + }, + { + "epoch": 0.78, + "grad_norm": 20.25, + "learning_rate": 7.183576346148899e-07, + "logits/chosen": 0.3111092746257782, + "logits/rejected": 1.6442598104476929, + "logps/chosen": -615.7824096679688, + "logps/rejected": -697.6546020507812, + "loss": 0.4505, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.264331102371216, + "rewards/margins": 1.4083982706069946, + "rewards/rejected": -4.672729015350342, + "step": 5940 + }, + { + "epoch": 0.78, + "grad_norm": 15.625, + "learning_rate": 7.103645212676044e-07, + "logits/chosen": 0.3181416392326355, + "logits/rejected": 1.6206680536270142, + "logps/chosen": -572.1453247070312, + "logps/rejected": -702.2391967773438, + "loss": 0.4264, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0080885887145996, + "rewards/margins": 1.6056219339370728, + "rewards/rejected": -4.613710403442383, + "step": 5950 + }, + { + "epoch": 0.78, + "grad_norm": 26.125, + "learning_rate": 7.024087557631318e-07, + "logits/chosen": 0.088649682700634, + "logits/rejected": 1.1377710103988647, + "logps/chosen": -626.9284057617188, + "logps/rejected": -731.088134765625, + "loss": 0.4376, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.1198205947875977, + "rewards/margins": 1.4172180891036987, + "rewards/rejected": -4.537038326263428, + "step": 5960 + }, + { + "epoch": 0.78, + "grad_norm": 11.3125, + "learning_rate": 6.944905041300739e-07, + "logits/chosen": 0.10835651308298111, + "logits/rejected": 0.9804534912109375, + "logps/chosen": -591.5462646484375, + "logps/rejected": -722.29150390625, + "loss": 0.4895, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.1032094955444336, + "rewards/margins": 1.463866949081421, + "rewards/rejected": -4.567076206207275, + "step": 5970 + }, + { + "epoch": 0.78, + "grad_norm": 11.5, + "learning_rate": 6.866099316141606e-07, + "logits/chosen": 0.04711395502090454, + "logits/rejected": 0.4793902337551117, + "logps/chosen": -625.3909912109375, + "logps/rejected": -680.4255981445312, + "loss": 0.5642, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.2621428966522217, + "rewards/margins": 1.0643290281295776, + "rewards/rejected": -4.32647180557251, + "step": 5980 + }, + { + "epoch": 0.78, + "grad_norm": 8.0, + "learning_rate": 6.787672026747946e-07, + "logits/chosen": 0.6254671216011047, + "logits/rejected": 2.311389923095703, + "logps/chosen": -620.357666015625, + "logps/rejected": -669.0045166015625, + "loss": 0.5704, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.432579517364502, + "rewards/margins": 1.1018991470336914, + "rewards/rejected": -4.534478664398193, + "step": 5990 + }, + { + "epoch": 0.79, + "grad_norm": 17.625, + "learning_rate": 6.709624809816223e-07, + "logits/chosen": 0.38161593675613403, + "logits/rejected": 1.1075352430343628, + "logps/chosen": -637.8136596679688, + "logps/rejected": -724.2991943359375, + "loss": 0.5147, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.4568965435028076, + "rewards/margins": 1.2378437519073486, + "rewards/rejected": -4.694740295410156, + "step": 6000 + }, + { + "epoch": 0.79, + "eval_logits/chosen": 1.1726433038711548, + "eval_logits/rejected": 1.9302729368209839, + "eval_logps/chosen": -596.0759887695312, + "eval_logps/rejected": -708.931884765625, + "eval_loss": 0.49101775884628296, + "eval_rewards/accuracies": 0.7505000233650208, + "eval_rewards/chosen": -3.314549207687378, + "eval_rewards/margins": 1.3289803266525269, + "eval_rewards/rejected": -4.643529415130615, + "eval_runtime": 1590.6827, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 6000 + }, + { + "epoch": 0.79, + "grad_norm": 9.8125, + "learning_rate": 6.6319592941112e-07, + "logits/chosen": -0.11990109831094742, + "logits/rejected": 1.020079493522644, + "logps/chosen": -551.75439453125, + "logps/rejected": -731.1846923828125, + "loss": 0.4526, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.1442012786865234, + "rewards/margins": 1.6808693408966064, + "rewards/rejected": -4.825070381164551, + "step": 6010 + }, + { + "epoch": 0.79, + "grad_norm": 18.125, + "learning_rate": 6.554677100431927e-07, + "logits/chosen": 0.13416549563407898, + "logits/rejected": 1.3785282373428345, + "logps/chosen": -633.962890625, + "logps/rejected": -651.944580078125, + "loss": 0.5893, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.3228306770324707, + "rewards/margins": 0.920928955078125, + "rewards/rejected": -4.243759632110596, + "step": 6020 + }, + { + "epoch": 0.79, + "grad_norm": 21.5, + "learning_rate": 6.4777798415779e-07, + "logits/chosen": 0.44740551710128784, + "logits/rejected": 1.2276852130889893, + "logps/chosen": -571.357421875, + "logps/rejected": -697.7033081054688, + "loss": 0.5064, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.125563144683838, + "rewards/margins": 1.19227135181427, + "rewards/rejected": -4.317834377288818, + "step": 6030 + }, + { + "epoch": 0.79, + "grad_norm": 11.875, + "learning_rate": 6.401269122315451e-07, + "logits/chosen": 0.5647193193435669, + "logits/rejected": 0.9143550992012024, + "logps/chosen": -606.7120971679688, + "logps/rejected": -702.9698486328125, + "loss": 0.4996, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.2641494274139404, + "rewards/margins": 1.107541799545288, + "rewards/rejected": -4.371691703796387, + "step": 6040 + }, + { + "epoch": 0.79, + "grad_norm": 12.9375, + "learning_rate": 6.325146539344196e-07, + "logits/chosen": 0.36607789993286133, + "logits/rejected": 0.7721070051193237, + "logps/chosen": -600.3197631835938, + "logps/rejected": -734.3776245117188, + "loss": 0.5224, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.335095167160034, + "rewards/margins": 1.2280336618423462, + "rewards/rejected": -4.56312894821167, + "step": 6050 + }, + { + "epoch": 0.79, + "grad_norm": 18.75, + "learning_rate": 6.249413681263782e-07, + "logits/chosen": 0.6674261093139648, + "logits/rejected": 1.2911916971206665, + "logps/chosen": -580.8428955078125, + "logps/rejected": -668.2259521484375, + "loss": 0.4819, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.283857822418213, + "rewards/margins": 1.2953107357025146, + "rewards/rejected": -4.57916784286499, + "step": 6060 + }, + { + "epoch": 0.79, + "grad_norm": 10.8125, + "learning_rate": 6.174072128540686e-07, + "logits/chosen": 0.3818279802799225, + "logits/rejected": 1.0219361782073975, + "logps/chosen": -553.1798095703125, + "logps/rejected": -647.6849365234375, + "loss": 0.4719, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.125274419784546, + "rewards/margins": 1.3637750148773193, + "rewards/rejected": -4.489049434661865, + "step": 6070 + }, + { + "epoch": 0.8, + "grad_norm": 20.625, + "learning_rate": 6.099123453475245e-07, + "logits/chosen": -0.03337569907307625, + "logits/rejected": 0.600936770439148, + "logps/chosen": -599.7701416015625, + "logps/rejected": -710.7029418945312, + "loss": 0.5543, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.229990005493164, + "rewards/margins": 1.0938842296600342, + "rewards/rejected": -4.323873996734619, + "step": 6080 + }, + { + "epoch": 0.8, + "grad_norm": 14.1875, + "learning_rate": 6.024569220168836e-07, + "logits/chosen": 0.9164674878120422, + "logits/rejected": 1.220346450805664, + "logps/chosen": -546.9329833984375, + "logps/rejected": -662.7405395507812, + "loss": 0.5744, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.2453300952911377, + "rewards/margins": 1.0616782903671265, + "rewards/rejected": -4.307008266448975, + "step": 6090 + }, + { + "epoch": 0.8, + "grad_norm": 13.875, + "learning_rate": 5.950410984491268e-07, + "logits/chosen": 0.7717280387878418, + "logits/rejected": 1.1919779777526855, + "logps/chosen": -580.9427490234375, + "logps/rejected": -707.2449951171875, + "loss": 0.4478, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.240947723388672, + "rewards/margins": 1.3304286003112793, + "rewards/rejected": -4.571376323699951, + "step": 6100 + }, + { + "epoch": 0.8, + "eval_logits/chosen": 1.0185655355453491, + "eval_logits/rejected": 1.7621047496795654, + "eval_logps/chosen": -585.310546875, + "eval_logps/rejected": -694.713134765625, + "eval_loss": 0.4886416494846344, + "eval_rewards/accuracies": 0.7480000257492065, + "eval_rewards/chosen": -3.206895589828491, + "eval_rewards/margins": 1.2944468259811401, + "eval_rewards/rejected": -4.501342296600342, + "eval_runtime": 1591.1979, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 6100 + }, + { + "epoch": 0.8, + "grad_norm": 7.75, + "learning_rate": 5.876650294048262e-07, + "logits/chosen": -0.022813748568296432, + "logits/rejected": 0.9088582992553711, + "logps/chosen": -578.6947021484375, + "logps/rejected": -691.7832641601562, + "loss": 0.4224, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.993137836456299, + "rewards/margins": 1.3222205638885498, + "rewards/rejected": -4.315358638763428, + "step": 6110 + }, + { + "epoch": 0.8, + "grad_norm": 12.6875, + "learning_rate": 5.8032886881492e-07, + "logits/chosen": 0.0022028745152056217, + "logits/rejected": 1.4448870420455933, + "logps/chosen": -580.8271484375, + "logps/rejected": -657.9542236328125, + "loss": 0.4356, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.1111884117126465, + "rewards/margins": 1.3057067394256592, + "rewards/rejected": -4.416895389556885, + "step": 6120 + }, + { + "epoch": 0.8, + "grad_norm": 5.96875, + "learning_rate": 5.730327697774988e-07, + "logits/chosen": 0.32467782497406006, + "logits/rejected": 0.7888936996459961, + "logps/chosen": -599.9251708984375, + "logps/rejected": -760.7127685546875, + "loss": 0.434, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.1381869316101074, + "rewards/margins": 1.4439395666122437, + "rewards/rejected": -4.582126617431641, + "step": 6130 + }, + { + "epoch": 0.8, + "grad_norm": 13.625, + "learning_rate": 5.657768845546068e-07, + "logits/chosen": 0.8671263456344604, + "logits/rejected": 1.679828405380249, + "logps/chosen": -563.6256103515625, + "logps/rejected": -678.2106323242188, + "loss": 0.5529, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.3380939960479736, + "rewards/margins": 1.1758853197097778, + "rewards/rejected": -4.513979434967041, + "step": 6140 + }, + { + "epoch": 0.8, + "grad_norm": 15.125, + "learning_rate": 5.585613645690713e-07, + "logits/chosen": -0.0506153330206871, + "logits/rejected": 0.853299617767334, + "logps/chosen": -596.2897338867188, + "logps/rejected": -728.8367309570312, + "loss": 0.447, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.2465102672576904, + "rewards/margins": 1.3228228092193604, + "rewards/rejected": -4.569333076477051, + "step": 6150 + }, + { + "epoch": 0.81, + "grad_norm": 10.625, + "learning_rate": 5.513863604013355e-07, + "logits/chosen": 0.5937899351119995, + "logits/rejected": 1.3581970930099487, + "logps/chosen": -574.1300659179688, + "logps/rejected": -655.2301025390625, + "loss": 0.4973, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.2126641273498535, + "rewards/margins": 1.2130138874053955, + "rewards/rejected": -4.42567777633667, + "step": 6160 + }, + { + "epoch": 0.81, + "grad_norm": 15.9375, + "learning_rate": 5.442520217863215e-07, + "logits/chosen": 0.37454861402511597, + "logits/rejected": 1.0216349363327026, + "logps/chosen": -633.1695556640625, + "logps/rejected": -760.712890625, + "loss": 0.4117, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1701767444610596, + "rewards/margins": 1.5251020193099976, + "rewards/rejected": -4.695279121398926, + "step": 6170 + }, + { + "epoch": 0.81, + "grad_norm": 37.5, + "learning_rate": 5.371584976103034e-07, + "logits/chosen": 0.19896648824214935, + "logits/rejected": 0.8611049652099609, + "logps/chosen": -606.3096313476562, + "logps/rejected": -702.2210083007812, + "loss": 0.499, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.2889304161071777, + "rewards/margins": 1.151632308959961, + "rewards/rejected": -4.440562725067139, + "step": 6180 + }, + { + "epoch": 0.81, + "grad_norm": 9.8125, + "learning_rate": 5.301059359077987e-07, + "logits/chosen": 0.5175013542175293, + "logits/rejected": 1.247930645942688, + "logps/chosen": -570.9851684570312, + "logps/rejected": -701.9575805664062, + "loss": 0.4799, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.3489022254943848, + "rewards/margins": 1.3883564472198486, + "rewards/rejected": -4.737257957458496, + "step": 6190 + }, + { + "epoch": 0.81, + "grad_norm": 23.5, + "learning_rate": 5.230944838584806e-07, + "logits/chosen": 0.49456173181533813, + "logits/rejected": 0.8041893243789673, + "logps/chosen": -576.3448486328125, + "logps/rejected": -717.2078247070312, + "loss": 0.5236, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.191760540008545, + "rewards/margins": 1.3665167093276978, + "rewards/rejected": -4.558276653289795, + "step": 6200 + }, + { + "epoch": 0.81, + "eval_logits/chosen": 1.0793569087982178, + "eval_logits/rejected": 1.8309375047683716, + "eval_logps/chosen": -596.6957397460938, + "eval_logps/rejected": -709.5498657226562, + "eval_loss": 0.4901345372200012, + "eval_rewards/accuracies": 0.7494999766349792, + "eval_rewards/chosen": -3.320746898651123, + "eval_rewards/margins": 1.3289613723754883, + "eval_rewards/rejected": -4.6497087478637695, + "eval_runtime": 1591.779, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 6200 + }, + { + "epoch": 0.81, + "grad_norm": 12.5, + "learning_rate": 5.161242877841083e-07, + "logits/chosen": 0.840091347694397, + "logits/rejected": 1.9746043682098389, + "logps/chosen": -618.3326416015625, + "logps/rejected": -682.1724853515625, + "loss": 0.5775, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.6712398529052734, + "rewards/margins": 1.017067790031433, + "rewards/rejected": -4.688307762145996, + "step": 6210 + }, + { + "epoch": 0.81, + "grad_norm": 27.125, + "learning_rate": 5.091954931454682e-07, + "logits/chosen": 0.28132981061935425, + "logits/rejected": 1.3453963994979858, + "logps/chosen": -594.2634887695312, + "logps/rejected": -707.8575439453125, + "loss": 0.529, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.3353753089904785, + "rewards/margins": 1.407685399055481, + "rewards/rejected": -4.743060111999512, + "step": 6220 + }, + { + "epoch": 0.82, + "grad_norm": 6.59375, + "learning_rate": 5.023082445393446e-07, + "logits/chosen": 0.30338960886001587, + "logits/rejected": 0.6292451024055481, + "logps/chosen": -613.8154296875, + "logps/rejected": -710.1831665039062, + "loss": 0.476, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.403163433074951, + "rewards/margins": 1.109538197517395, + "rewards/rejected": -4.512701988220215, + "step": 6230 + }, + { + "epoch": 0.82, + "grad_norm": 15.3125, + "learning_rate": 4.95462685695498e-07, + "logits/chosen": 0.29414045810699463, + "logits/rejected": 0.9581305384635925, + "logps/chosen": -574.3023681640625, + "logps/rejected": -708.0538940429688, + "loss": 0.4095, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.2103676795959473, + "rewards/margins": 1.4833682775497437, + "rewards/rejected": -4.6937360763549805, + "step": 6240 + }, + { + "epoch": 0.82, + "grad_norm": 6.4375, + "learning_rate": 4.88658959473666e-07, + "logits/chosen": -0.03567713499069214, + "logits/rejected": 0.8967617154121399, + "logps/chosen": -583.6680908203125, + "logps/rejected": -677.9716186523438, + "loss": 0.4951, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.315932035446167, + "rewards/margins": 1.201798439025879, + "rewards/rejected": -4.517730236053467, + "step": 6250 + }, + { + "epoch": 0.82, + "grad_norm": 11.0625, + "learning_rate": 4.818972078605821e-07, + "logits/chosen": 0.3524642586708069, + "logits/rejected": 1.6111385822296143, + "logps/chosen": -596.5489501953125, + "logps/rejected": -727.8389892578125, + "loss": 0.4802, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.2479538917541504, + "rewards/margins": 1.6509485244750977, + "rewards/rejected": -4.898902416229248, + "step": 6260 + }, + { + "epoch": 0.82, + "grad_norm": 8.4375, + "learning_rate": 4.7517757196701514e-07, + "logits/chosen": 0.23523840308189392, + "logits/rejected": 1.2825753688812256, + "logps/chosen": -628.8624267578125, + "logps/rejected": -738.5863037109375, + "loss": 0.4968, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.5499215126037598, + "rewards/margins": 1.364435076713562, + "rewards/rejected": -4.914356708526611, + "step": 6270 + }, + { + "epoch": 0.82, + "grad_norm": 15.0625, + "learning_rate": 4.6850019202482193e-07, + "logits/chosen": 0.2345297634601593, + "logits/rejected": 1.5206564664840698, + "logps/chosen": -624.873046875, + "logps/rejected": -731.8216552734375, + "loss": 0.4436, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.4002845287323, + "rewards/margins": 1.4457454681396484, + "rewards/rejected": -4.846030235290527, + "step": 6280 + }, + { + "epoch": 0.82, + "grad_norm": 7.9375, + "learning_rate": 4.618652073840188e-07, + "logits/chosen": 0.2798255681991577, + "logits/rejected": 1.280543327331543, + "logps/chosen": -546.3389282226562, + "logps/rejected": -665.4715576171875, + "loss": 0.476, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.009049654006958, + "rewards/margins": 1.4991285800933838, + "rewards/rejected": -4.508177757263184, + "step": 6290 + }, + { + "epoch": 0.82, + "grad_norm": 8.5, + "learning_rate": 4.5527275650987965e-07, + "logits/chosen": 0.2119041234254837, + "logits/rejected": 0.7161015272140503, + "logps/chosen": -608.0989990234375, + "logps/rejected": -738.2981567382812, + "loss": 0.5079, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.027043581008911, + "rewards/margins": 1.2831947803497314, + "rewards/rejected": -4.310237884521484, + "step": 6300 + }, + { + "epoch": 0.82, + "eval_logits/chosen": 1.032199501991272, + "eval_logits/rejected": 1.7747009992599487, + "eval_logps/chosen": -595.4583129882812, + "eval_logps/rejected": -706.781982421875, + "eval_loss": 0.4889962375164032, + "eval_rewards/accuracies": 0.7494999766349792, + "eval_rewards/chosen": -3.3083724975585938, + "eval_rewards/margins": 1.3136582374572754, + "eval_rewards/rejected": -4.622030735015869, + "eval_runtime": 1592.0101, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 6300 + }, + { + "epoch": 0.83, + "grad_norm": 11.75, + "learning_rate": 4.487229769800394e-07, + "logits/chosen": 0.3432023823261261, + "logits/rejected": 0.8247334361076355, + "logps/chosen": -597.5015869140625, + "logps/rejected": -651.3562622070312, + "loss": 0.5573, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.2687923908233643, + "rewards/margins": 0.9014599919319153, + "rewards/rejected": -4.170252323150635, + "step": 6310 + }, + { + "epoch": 0.83, + "grad_norm": 29.0, + "learning_rate": 4.422160054816285e-07, + "logits/chosen": 0.20175309479236603, + "logits/rejected": 0.72217857837677, + "logps/chosen": -546.6419067382812, + "logps/rejected": -679.1134033203125, + "loss": 0.5044, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.018216371536255, + "rewards/margins": 1.276477336883545, + "rewards/rejected": -4.294693946838379, + "step": 6320 + }, + { + "epoch": 0.83, + "grad_norm": 17.875, + "learning_rate": 4.35751977808416e-07, + "logits/chosen": 0.7091022729873657, + "logits/rejected": 1.5161218643188477, + "logps/chosen": -610.763427734375, + "logps/rejected": -721.1217651367188, + "loss": 0.4028, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.3477084636688232, + "rewards/margins": 1.465746521949768, + "rewards/rejected": -4.813455104827881, + "step": 6330 + }, + { + "epoch": 0.83, + "grad_norm": 13.25, + "learning_rate": 4.293310288579794e-07, + "logits/chosen": 0.09544781595468521, + "logits/rejected": 0.8159352540969849, + "logps/chosen": -580.238037109375, + "logps/rejected": -642.5474243164062, + "loss": 0.5207, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.303569793701172, + "rewards/margins": 0.9509817361831665, + "rewards/rejected": -4.254551887512207, + "step": 6340 + }, + { + "epoch": 0.83, + "grad_norm": 17.0, + "learning_rate": 4.2295329262888733e-07, + "logits/chosen": 0.020851727575063705, + "logits/rejected": 1.0389108657836914, + "logps/chosen": -595.221923828125, + "logps/rejected": -701.2500610351562, + "loss": 0.5483, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.224815845489502, + "rewards/margins": 1.1890679597854614, + "rewards/rejected": -4.413883686065674, + "step": 6350 + }, + { + "epoch": 0.83, + "grad_norm": 54.75, + "learning_rate": 4.1661890221790316e-07, + "logits/chosen": 0.35152512788772583, + "logits/rejected": 1.3631591796875, + "logps/chosen": -667.9931030273438, + "logps/rejected": -738.0748291015625, + "loss": 0.6645, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.6574642658233643, + "rewards/margins": 0.9594003558158875, + "rewards/rejected": -4.616864204406738, + "step": 6360 + }, + { + "epoch": 0.83, + "grad_norm": 15.375, + "learning_rate": 4.103279898172072e-07, + "logits/chosen": 0.49099016189575195, + "logits/rejected": 0.9257776141166687, + "logps/chosen": -560.7960205078125, + "logps/rejected": -641.2603149414062, + "loss": 0.5571, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.2241997718811035, + "rewards/margins": 1.0578079223632812, + "rewards/rejected": -4.282007694244385, + "step": 6370 + }, + { + "epoch": 0.83, + "grad_norm": 8.5, + "learning_rate": 4.040806867116401e-07, + "logits/chosen": 0.4509049952030182, + "logits/rejected": 1.1655193567276, + "logps/chosen": -627.8912353515625, + "logps/rejected": -751.0303955078125, + "loss": 0.4506, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.246445417404175, + "rewards/margins": 1.472038745880127, + "rewards/rejected": -4.718484401702881, + "step": 6380 + }, + { + "epoch": 0.84, + "grad_norm": 17.875, + "learning_rate": 3.978771232759615e-07, + "logits/chosen": 0.5065037608146667, + "logits/rejected": 1.0776305198669434, + "logps/chosen": -563.1023559570312, + "logps/rejected": -674.8141479492188, + "loss": 0.4742, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.238140821456909, + "rewards/margins": 1.2237097024917603, + "rewards/rejected": -4.461850643157959, + "step": 6390 + }, + { + "epoch": 0.84, + "grad_norm": 13.875, + "learning_rate": 3.917174289721276e-07, + "logits/chosen": 0.5604764819145203, + "logits/rejected": 0.7512072324752808, + "logps/chosen": -558.1353759765625, + "logps/rejected": -691.1140747070312, + "loss": 0.4942, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.1084206104278564, + "rewards/margins": 1.387665033340454, + "rewards/rejected": -4.496085166931152, + "step": 6400 + }, + { + "epoch": 0.84, + "eval_logits/chosen": 1.026822566986084, + "eval_logits/rejected": 1.7716290950775146, + "eval_logps/chosen": -590.8313598632812, + "eval_logps/rejected": -701.301025390625, + "eval_loss": 0.48912298679351807, + "eval_rewards/accuracies": 0.7494999766349792, + "eval_rewards/chosen": -3.26210355758667, + "eval_rewards/margins": 1.3051165342330933, + "eval_rewards/rejected": -4.5672197341918945, + "eval_runtime": 1591.6194, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 6400 + }, + { + "epoch": 0.84, + "grad_norm": 25.0, + "learning_rate": 3.856017323465938e-07, + "logits/chosen": 0.08190663903951645, + "logits/rejected": 0.6202396750450134, + "logps/chosen": -553.3712768554688, + "logps/rejected": -693.1614990234375, + "loss": 0.4305, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.957423210144043, + "rewards/margins": 1.4459336996078491, + "rewards/rejected": -4.403356552124023, + "step": 6410 + }, + { + "epoch": 0.84, + "grad_norm": 19.125, + "learning_rate": 3.7953016102762695e-07, + "logits/chosen": 0.3419317603111267, + "logits/rejected": 1.0710508823394775, + "logps/chosen": -599.0252685546875, + "logps/rejected": -694.2352294921875, + "loss": 0.5246, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.2364730834960938, + "rewards/margins": 1.1976615190505981, + "rewards/rejected": -4.434134483337402, + "step": 6420 + }, + { + "epoch": 0.84, + "grad_norm": 15.1875, + "learning_rate": 3.7350284172264493e-07, + "logits/chosen": 0.45977815985679626, + "logits/rejected": 1.0664544105529785, + "logps/chosen": -610.8163452148438, + "logps/rejected": -696.8948974609375, + "loss": 0.5006, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.2424583435058594, + "rewards/margins": 1.174291968345642, + "rewards/rejected": -4.416750431060791, + "step": 6430 + }, + { + "epoch": 0.84, + "grad_norm": 8.875, + "learning_rate": 3.67519900215573e-07, + "logits/chosen": 0.2526703476905823, + "logits/rejected": 1.0846112966537476, + "logps/chosen": -596.3863525390625, + "logps/rejected": -693.46875, + "loss": 0.3881, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.027198314666748, + "rewards/margins": 1.4641358852386475, + "rewards/rejected": -4.491333961486816, + "step": 6440 + }, + { + "epoch": 0.84, + "grad_norm": 4.84375, + "learning_rate": 3.615814613642174e-07, + "logits/chosen": 0.18329405784606934, + "logits/rejected": 0.8037030100822449, + "logps/chosen": -567.7757568359375, + "logps/rejected": -723.3435668945312, + "loss": 0.4066, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0279901027679443, + "rewards/margins": 1.6228275299072266, + "rewards/rejected": -4.65081787109375, + "step": 6450 + }, + { + "epoch": 0.85, + "grad_norm": 18.625, + "learning_rate": 3.5568764909765795e-07, + "logits/chosen": 0.4307138919830322, + "logits/rejected": 1.433720350265503, + "logps/chosen": -633.9922485351562, + "logps/rejected": -681.5325317382812, + "loss": 0.5682, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.525965929031372, + "rewards/margins": 1.0324019193649292, + "rewards/rejected": -4.558367729187012, + "step": 6460 + }, + { + "epoch": 0.85, + "grad_norm": 7.78125, + "learning_rate": 3.498385864136672e-07, + "logits/chosen": 0.23667888343334198, + "logits/rejected": 0.9870807528495789, + "logps/chosen": -586.2492065429688, + "logps/rejected": -757.7203369140625, + "loss": 0.4709, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.298717975616455, + "rewards/margins": 1.508500337600708, + "rewards/rejected": -4.807218074798584, + "step": 6470 + }, + { + "epoch": 0.85, + "grad_norm": 19.375, + "learning_rate": 3.440343953761363e-07, + "logits/chosen": 0.3215915858745575, + "logits/rejected": 1.3016798496246338, + "logps/chosen": -536.6361694335938, + "logps/rejected": -679.41650390625, + "loss": 0.3826, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9511194229125977, + "rewards/margins": 1.6483557224273682, + "rewards/rejected": -4.599474906921387, + "step": 6480 + }, + { + "epoch": 0.85, + "grad_norm": 16.0, + "learning_rate": 3.382751971125345e-07, + "logits/chosen": 0.7153388261795044, + "logits/rejected": 1.2328914403915405, + "logps/chosen": -628.5037841796875, + "logps/rejected": -717.3402099609375, + "loss": 0.5905, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.6130409240722656, + "rewards/margins": 1.0365272760391235, + "rewards/rejected": -4.649567604064941, + "step": 6490 + }, + { + "epoch": 0.85, + "grad_norm": 10.0, + "learning_rate": 3.3256111181137753e-07, + "logits/chosen": 0.12050239741802216, + "logits/rejected": 1.2598720788955688, + "logps/chosen": -616.6473388671875, + "logps/rejected": -675.8792724609375, + "loss": 0.4688, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2907919883728027, + "rewards/margins": 1.167119026184082, + "rewards/rejected": -4.457911491394043, + "step": 6500 + }, + { + "epoch": 0.85, + "eval_logits/chosen": 1.040233850479126, + "eval_logits/rejected": 1.7862778902053833, + "eval_logps/chosen": -593.2546997070312, + "eval_logps/rejected": -704.1409912109375, + "eval_loss": 0.48905226588249207, + "eval_rewards/accuracies": 0.7505000233650208, + "eval_rewards/chosen": -3.286336660385132, + "eval_rewards/margins": 1.3092845678329468, + "eval_rewards/rejected": -4.595621109008789, + "eval_runtime": 1590.4062, + "eval_samples_per_second": 1.258, + "eval_steps_per_second": 0.314, + "step": 6500 + }, + { + "epoch": 0.85, + "grad_norm": 17.0, + "learning_rate": 3.2689225871971905e-07, + "logits/chosen": 0.286967933177948, + "logits/rejected": 1.4739683866500854, + "logps/chosen": -553.6991577148438, + "logps/rejected": -687.5084228515625, + "loss": 0.4413, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.2888290882110596, + "rewards/margins": 1.5385186672210693, + "rewards/rejected": -4.827347755432129, + "step": 6510 + }, + { + "epoch": 0.85, + "grad_norm": 11.1875, + "learning_rate": 3.2126875614066523e-07, + "logits/chosen": -0.10929323732852936, + "logits/rejected": 1.3522584438323975, + "logps/chosen": -552.710205078125, + "logps/rejected": -673.8087768554688, + "loss": 0.477, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.0392167568206787, + "rewards/margins": 1.356864333152771, + "rewards/rejected": -4.39608097076416, + "step": 6520 + }, + { + "epoch": 0.85, + "grad_norm": 16.125, + "learning_rate": 3.156907214309024e-07, + "logits/chosen": 0.7336920499801636, + "logits/rejected": 0.7236236333847046, + "logps/chosen": -560.0294189453125, + "logps/rejected": -728.3599243164062, + "loss": 0.4632, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.27976655960083, + "rewards/margins": 1.335707426071167, + "rewards/rejected": -4.615473747253418, + "step": 6530 + }, + { + "epoch": 0.86, + "grad_norm": 11.0625, + "learning_rate": 3.1015827099824923e-07, + "logits/chosen": 0.3031115233898163, + "logits/rejected": 1.1659104824066162, + "logps/chosen": -624.4817504882812, + "logps/rejected": -751.3743896484375, + "loss": 0.5015, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.492037534713745, + "rewards/margins": 1.4371260404586792, + "rewards/rejected": -4.929163932800293, + "step": 6540 + }, + { + "epoch": 0.86, + "grad_norm": 8.1875, + "learning_rate": 3.0467152029922926e-07, + "logits/chosen": 0.5154403448104858, + "logits/rejected": 0.4819985032081604, + "logps/chosen": -536.7814331054688, + "logps/rejected": -670.2327270507812, + "loss": 0.5557, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.159590482711792, + "rewards/margins": 1.1023595333099365, + "rewards/rejected": -4.2619500160217285, + "step": 6550 + }, + { + "epoch": 0.86, + "grad_norm": 11.4375, + "learning_rate": 2.992305838366591e-07, + "logits/chosen": 0.2807716131210327, + "logits/rejected": 1.1566855907440186, + "logps/chosen": -624.9320678710938, + "logps/rejected": -709.5386352539062, + "loss": 0.4996, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.347169876098633, + "rewards/margins": 1.311095952987671, + "rewards/rejected": -4.658266067504883, + "step": 6560 + }, + { + "epoch": 0.86, + "grad_norm": 11.625, + "learning_rate": 2.938355751572583e-07, + "logits/chosen": 0.4360644817352295, + "logits/rejected": 1.5401076078414917, + "logps/chosen": -575.8522338867188, + "logps/rejected": -724.4762573242188, + "loss": 0.402, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.2187793254852295, + "rewards/margins": 1.7893708944320679, + "rewards/rejected": -5.008150100708008, + "step": 6570 + }, + { + "epoch": 0.86, + "grad_norm": 20.625, + "learning_rate": 2.8848660684928307e-07, + "logits/chosen": 0.6239246726036072, + "logits/rejected": 1.2716379165649414, + "logps/chosen": -583.64697265625, + "logps/rejected": -739.271728515625, + "loss": 0.3718, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.1440670490264893, + "rewards/margins": 1.6428172588348389, + "rewards/rejected": -4.786884307861328, + "step": 6580 + }, + { + "epoch": 0.86, + "grad_norm": 11.375, + "learning_rate": 2.8318379054017383e-07, + "logits/chosen": 0.28017204999923706, + "logits/rejected": 0.6756846904754639, + "logps/chosen": -587.0732421875, + "logps/rejected": -677.6159057617188, + "loss": 0.6121, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.404522657394409, + "rewards/margins": 0.8272771835327148, + "rewards/rejected": -4.231800079345703, + "step": 6590 + }, + { + "epoch": 0.86, + "grad_norm": 9.3125, + "learning_rate": 2.779272368942246e-07, + "logits/chosen": 0.1423681676387787, + "logits/rejected": 1.035640001296997, + "logps/chosen": -550.0023193359375, + "logps/rejected": -636.7756958007812, + "loss": 0.5062, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.1119656562805176, + "rewards/margins": 1.2890288829803467, + "rewards/rejected": -4.400994300842285, + "step": 6600 + }, + { + "epoch": 0.86, + "eval_logits/chosen": 1.0260632038116455, + "eval_logits/rejected": 1.769521951675415, + "eval_logps/chosen": -593.8478393554688, + "eval_logps/rejected": -704.8690795898438, + "eval_loss": 0.4889431893825531, + "eval_rewards/accuracies": 0.7484999895095825, + "eval_rewards/chosen": -3.2922675609588623, + "eval_rewards/margins": 1.3106337785720825, + "eval_rewards/rejected": -4.602901458740234, + "eval_runtime": 1590.2933, + "eval_samples_per_second": 1.258, + "eval_steps_per_second": 0.314, + "step": 6600 + }, + { + "epoch": 0.86, + "grad_norm": 11.1875, + "learning_rate": 2.7271705561027986e-07, + "logits/chosen": 0.3130180835723877, + "logits/rejected": 0.7469149827957153, + "logps/chosen": -608.0856323242188, + "logps/rejected": -725.796875, + "loss": 0.5438, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.389724016189575, + "rewards/margins": 1.0343810319900513, + "rewards/rejected": -4.424104690551758, + "step": 6610 + }, + { + "epoch": 0.87, + "grad_norm": 28.0, + "learning_rate": 2.6755335541943677e-07, + "logits/chosen": 0.19360849261283875, + "logits/rejected": 1.0395077466964722, + "logps/chosen": -598.2835693359375, + "logps/rejected": -704.2481689453125, + "loss": 0.4799, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.331153392791748, + "rewards/margins": 1.3434398174285889, + "rewards/rejected": -4.674593448638916, + "step": 6620 + }, + { + "epoch": 0.87, + "grad_norm": 10.4375, + "learning_rate": 2.62436244082781e-07, + "logits/chosen": 0.21957476437091827, + "logits/rejected": 0.7514572143554688, + "logps/chosen": -567.141845703125, + "logps/rejected": -697.9176025390625, + "loss": 0.4481, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1435608863830566, + "rewards/margins": 1.4471392631530762, + "rewards/rejected": -4.590699672698975, + "step": 6630 + }, + { + "epoch": 0.87, + "grad_norm": 7.125, + "learning_rate": 2.5736582838913836e-07, + "logits/chosen": 0.22806484997272491, + "logits/rejected": 0.9933506846427917, + "logps/chosen": -621.5804443359375, + "logps/rejected": -714.3753662109375, + "loss": 0.46, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.256880521774292, + "rewards/margins": 1.346673846244812, + "rewards/rejected": -4.6035542488098145, + "step": 6640 + }, + { + "epoch": 0.87, + "grad_norm": 16.0, + "learning_rate": 2.5234221415284363e-07, + "logits/chosen": 0.29249635338783264, + "logits/rejected": 1.1430519819259644, + "logps/chosen": -599.8883666992188, + "logps/rejected": -712.6788330078125, + "loss": 0.4889, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.0099844932556152, + "rewards/margins": 1.4878642559051514, + "rewards/rejected": -4.4978485107421875, + "step": 6650 + }, + { + "epoch": 0.87, + "grad_norm": 14.5625, + "learning_rate": 2.4736550621153375e-07, + "logits/chosen": 0.7163133025169373, + "logits/rejected": 1.4217084646224976, + "logps/chosen": -590.7764892578125, + "logps/rejected": -702.0975952148438, + "loss": 0.5071, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.3552703857421875, + "rewards/margins": 1.2537267208099365, + "rewards/rejected": -4.608997344970703, + "step": 6660 + }, + { + "epoch": 0.87, + "grad_norm": 15.3125, + "learning_rate": 2.424358084239609e-07, + "logits/chosen": 0.09558672457933426, + "logits/rejected": 1.1775563955307007, + "logps/chosen": -608.3162231445312, + "logps/rejected": -671.51220703125, + "loss": 0.5659, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.291945219039917, + "rewards/margins": 1.0342015027999878, + "rewards/rejected": -4.326146125793457, + "step": 6670 + }, + { + "epoch": 0.87, + "grad_norm": 18.375, + "learning_rate": 2.3755322366782158e-07, + "logits/chosen": 0.4346315860748291, + "logits/rejected": 0.7198947072029114, + "logps/chosen": -591.690673828125, + "logps/rejected": -744.840087890625, + "loss": 0.5241, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.6358695030212402, + "rewards/margins": 1.2703872919082642, + "rewards/rejected": -4.906256675720215, + "step": 6680 + }, + { + "epoch": 0.88, + "grad_norm": 7.71875, + "learning_rate": 2.3271785383761431e-07, + "logits/chosen": 0.19641201198101044, + "logits/rejected": 1.4842437505722046, + "logps/chosen": -642.49853515625, + "logps/rejected": -760.596435546875, + "loss": 0.4263, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.3886337280273438, + "rewards/margins": 1.6620216369628906, + "rewards/rejected": -5.050655364990234, + "step": 6690 + }, + { + "epoch": 0.88, + "grad_norm": 17.125, + "learning_rate": 2.2792979984250978e-07, + "logits/chosen": 0.26097798347473145, + "logits/rejected": 1.3378149271011353, + "logps/chosen": -581.4967651367188, + "logps/rejected": -659.17578125, + "loss": 0.574, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.349140167236328, + "rewards/margins": 1.2462130784988403, + "rewards/rejected": -4.595353603363037, + "step": 6700 + }, + { + "epoch": 0.88, + "eval_logits/chosen": 1.0140420198440552, + "eval_logits/rejected": 1.757304072380066, + "eval_logps/chosen": -592.4088745117188, + "eval_logps/rejected": -703.44287109375, + "eval_loss": 0.4887068569660187, + "eval_rewards/accuracies": 0.7494999766349792, + "eval_rewards/chosen": -3.277878522872925, + "eval_rewards/margins": 1.3107616901397705, + "eval_rewards/rejected": -4.588640213012695, + "eval_runtime": 1592.3239, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 6700 + }, + { + "epoch": 0.88, + "grad_norm": 6.375, + "learning_rate": 2.231891616042453e-07, + "logits/chosen": 0.10554889589548111, + "logits/rejected": 1.7582786083221436, + "logps/chosen": -627.1198120117188, + "logps/rejected": -784.3391723632812, + "loss": 0.3874, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.3611323833465576, + "rewards/margins": 1.8259894847869873, + "rewards/rejected": -5.187121868133545, + "step": 6710 + }, + { + "epoch": 0.88, + "grad_norm": 20.25, + "learning_rate": 2.1849603805504328e-07, + "logits/chosen": 0.6380084753036499, + "logits/rejected": 1.636945366859436, + "logps/chosen": -549.2799072265625, + "logps/rejected": -701.5870971679688, + "loss": 0.4267, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.3033134937286377, + "rewards/margins": 1.6281543970108032, + "rewards/rejected": -4.9314680099487305, + "step": 6720 + }, + { + "epoch": 0.88, + "grad_norm": 17.5, + "learning_rate": 2.1385052713554066e-07, + "logits/chosen": -0.12183733284473419, + "logits/rejected": 1.4319367408752441, + "logps/chosen": -617.0516967773438, + "logps/rejected": -651.3189086914062, + "loss": 0.4917, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.2507731914520264, + "rewards/margins": 1.1351972818374634, + "rewards/rejected": -4.385970592498779, + "step": 6730 + }, + { + "epoch": 0.88, + "grad_norm": 6.3125, + "learning_rate": 2.0925272579274873e-07, + "logits/chosen": 0.11045311391353607, + "logits/rejected": 0.18677489459514618, + "logps/chosen": -570.3370361328125, + "logps/rejected": -668.0396728515625, + "loss": 0.5353, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.2531864643096924, + "rewards/margins": 1.0719640254974365, + "rewards/rejected": -4.325150489807129, + "step": 6740 + }, + { + "epoch": 0.88, + "grad_norm": 17.5, + "learning_rate": 2.047027299780302e-07, + "logits/chosen": 0.4011760354042053, + "logits/rejected": 1.157820224761963, + "logps/chosen": -590.8807373046875, + "logps/rejected": -714.6888427734375, + "loss": 0.5, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.279855251312256, + "rewards/margins": 1.36622953414917, + "rewards/rejected": -4.646084785461426, + "step": 6750 + }, + { + "epoch": 0.88, + "grad_norm": 13.875, + "learning_rate": 2.0020063464509492e-07, + "logits/chosen": 0.5368996858596802, + "logits/rejected": 1.6831638813018799, + "logps/chosen": -544.4091186523438, + "logps/rejected": -690.4616088867188, + "loss": 0.4165, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.224353313446045, + "rewards/margins": 1.5393102169036865, + "rewards/rejected": -4.7636637687683105, + "step": 6760 + }, + { + "epoch": 0.89, + "grad_norm": 7.34375, + "learning_rate": 1.957465337480191e-07, + "logits/chosen": 0.08371684700250626, + "logits/rejected": 1.7062151432037354, + "logps/chosen": -610.7371826171875, + "logps/rejected": -743.8865966796875, + "loss": 0.3572, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.165255308151245, + "rewards/margins": 1.8801519870758057, + "rewards/rejected": -5.045407295227051, + "step": 6770 + }, + { + "epoch": 0.89, + "grad_norm": 17.5, + "learning_rate": 1.9134052023928622e-07, + "logits/chosen": 0.4712475836277008, + "logits/rejected": 1.06361722946167, + "logps/chosen": -583.5921020507812, + "logps/rejected": -697.35302734375, + "loss": 0.4598, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.1947989463806152, + "rewards/margins": 1.396549105644226, + "rewards/rejected": -4.591347694396973, + "step": 6780 + }, + { + "epoch": 0.89, + "grad_norm": 17.125, + "learning_rate": 1.8698268606784392e-07, + "logits/chosen": 0.4350413680076599, + "logits/rejected": 1.8006197214126587, + "logps/chosen": -607.4656372070312, + "logps/rejected": -689.8092041015625, + "loss": 0.4508, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.259523391723633, + "rewards/margins": 1.4721763134002686, + "rewards/rejected": -4.7316999435424805, + "step": 6790 + }, + { + "epoch": 0.89, + "grad_norm": 22.75, + "learning_rate": 1.826731221771866e-07, + "logits/chosen": 0.5540640950202942, + "logits/rejected": 0.7710912227630615, + "logps/chosen": -551.3302612304688, + "logps/rejected": -673.9688720703125, + "loss": 0.5737, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.183842420578003, + "rewards/margins": 1.0950297117233276, + "rewards/rejected": -4.278872013092041, + "step": 6800 + }, + { + "epoch": 0.89, + "eval_logits/chosen": 1.0125845670700073, + "eval_logits/rejected": 1.755982518196106, + "eval_logps/chosen": -593.7937622070312, + "eval_logps/rejected": -704.9940185546875, + "eval_loss": 0.4887009859085083, + "eval_rewards/accuracies": 0.7509999871253967, + "eval_rewards/chosen": -3.291727066040039, + "eval_rewards/margins": 1.312423586845398, + "eval_rewards/rejected": -4.604150295257568, + "eval_runtime": 1591.5754, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 6800 + }, + { + "epoch": 0.89, + "grad_norm": 19.0, + "learning_rate": 1.7841191850345967e-07, + "logits/chosen": 0.5574437379837036, + "logits/rejected": 1.2674754858016968, + "logps/chosen": -571.6987915039062, + "logps/rejected": -722.4050903320312, + "loss": 0.4244, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.2846603393554688, + "rewards/margins": 1.5775883197784424, + "rewards/rejected": -4.862248420715332, + "step": 6810 + }, + { + "epoch": 0.89, + "grad_norm": 22.625, + "learning_rate": 1.7419916397357905e-07, + "logits/chosen": 0.705112099647522, + "logits/rejected": 1.1643078327178955, + "logps/chosen": -580.8120727539062, + "logps/rejected": -690.5592041015625, + "loss": 0.5157, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.315786838531494, + "rewards/margins": 1.2436031103134155, + "rewards/rejected": -4.559390068054199, + "step": 6820 + }, + { + "epoch": 0.89, + "grad_norm": 8.8125, + "learning_rate": 1.700349465033782e-07, + "logits/chosen": 0.3245560824871063, + "logits/rejected": 1.4476044178009033, + "logps/chosen": -562.4835815429688, + "logps/rejected": -703.9220581054688, + "loss": 0.3617, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.087944269180298, + "rewards/margins": 1.636804223060608, + "rewards/rejected": -4.7247490882873535, + "step": 6830 + }, + { + "epoch": 0.9, + "grad_norm": 13.4375, + "learning_rate": 1.6591935299577227e-07, + "logits/chosen": 0.8055804967880249, + "logits/rejected": 1.5917952060699463, + "logps/chosen": -572.6065673828125, + "logps/rejected": -755.8960571289062, + "loss": 0.4138, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.3246803283691406, + "rewards/margins": 1.7669527530670166, + "rewards/rejected": -5.091632843017578, + "step": 6840 + }, + { + "epoch": 0.9, + "grad_norm": 15.6875, + "learning_rate": 1.6185246933894338e-07, + "logits/chosen": 0.25553420186042786, + "logits/rejected": 1.0945329666137695, + "logps/chosen": -616.6585693359375, + "logps/rejected": -763.8131713867188, + "loss": 0.3875, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.3123505115509033, + "rewards/margins": 1.6443926095962524, + "rewards/rejected": -4.956742763519287, + "step": 6850 + }, + { + "epoch": 0.9, + "grad_norm": 16.625, + "learning_rate": 1.5783438040455097e-07, + "logits/chosen": 0.12614001333713531, + "logits/rejected": 1.4914333820343018, + "logps/chosen": -577.5618286132812, + "logps/rejected": -689.0383911132812, + "loss": 0.3949, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.0827829837799072, + "rewards/margins": 1.6236732006072998, + "rewards/rejected": -4.706456184387207, + "step": 6860 + }, + { + "epoch": 0.9, + "grad_norm": 10.6875, + "learning_rate": 1.538651700459576e-07, + "logits/chosen": 0.025020074099302292, + "logits/rejected": 0.9685807228088379, + "logps/chosen": -585.4603271484375, + "logps/rejected": -646.5584716796875, + "loss": 0.5232, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.115504503250122, + "rewards/margins": 1.0184857845306396, + "rewards/rejected": -4.133990287780762, + "step": 6870 + }, + { + "epoch": 0.9, + "grad_norm": 14.0, + "learning_rate": 1.4994492109648151e-07, + "logits/chosen": 0.2431701421737671, + "logits/rejected": 0.8176721334457397, + "logps/chosen": -597.099365234375, + "logps/rejected": -721.96337890625, + "loss": 0.4917, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.4174275398254395, + "rewards/margins": 1.211806058883667, + "rewards/rejected": -4.629233360290527, + "step": 6880 + }, + { + "epoch": 0.9, + "grad_norm": 8.5, + "learning_rate": 1.4607371536766695e-07, + "logits/chosen": 0.5689305067062378, + "logits/rejected": 1.2870957851409912, + "logps/chosen": -623.2686767578125, + "logps/rejected": -730.4143676757812, + "loss": 0.4955, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.4705052375793457, + "rewards/margins": 1.2539684772491455, + "rewards/rejected": -4.72447395324707, + "step": 6890 + }, + { + "epoch": 0.9, + "grad_norm": 12.8125, + "learning_rate": 1.4225163364757655e-07, + "logits/chosen": -0.0002982348087243736, + "logits/rejected": 1.2865327596664429, + "logps/chosen": -572.536865234375, + "logps/rejected": -690.8985595703125, + "loss": 0.4298, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.1573309898376465, + "rewards/margins": 1.4404720067977905, + "rewards/rejected": -4.597803592681885, + "step": 6900 + }, + { + "epoch": 0.9, + "eval_logits/chosen": 1.0130374431610107, + "eval_logits/rejected": 1.7562763690948486, + "eval_logps/chosen": -594.4664306640625, + "eval_logps/rejected": -705.733154296875, + "eval_loss": 0.4889255464076996, + "eval_rewards/accuracies": 0.7505000233650208, + "eval_rewards/chosen": -3.2984538078308105, + "eval_rewards/margins": 1.3130884170532227, + "eval_rewards/rejected": -4.611542224884033, + "eval_runtime": 1591.8913, + "eval_samples_per_second": 1.256, + "eval_steps_per_second": 0.314, + "step": 6900 + }, + { + "epoch": 0.9, + "grad_norm": 20.5, + "learning_rate": 1.3847875569910462e-07, + "logits/chosen": 0.5036298632621765, + "logits/rejected": 1.5973215103149414, + "logps/chosen": -556.7360229492188, + "logps/rejected": -693.60009765625, + "loss": 0.4498, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.1470251083374023, + "rewards/margins": 1.5626558065414429, + "rewards/rejected": -4.709681034088135, + "step": 6910 + }, + { + "epoch": 0.91, + "grad_norm": 7.21875, + "learning_rate": 1.3475516025831552e-07, + "logits/chosen": 0.2578235864639282, + "logits/rejected": 1.1093521118164062, + "logps/chosen": -571.198974609375, + "logps/rejected": -716.4463500976562, + "loss": 0.4159, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.120373249053955, + "rewards/margins": 1.5449237823486328, + "rewards/rejected": -4.665297031402588, + "step": 6920 + }, + { + "epoch": 0.91, + "grad_norm": 8.5625, + "learning_rate": 1.310809250327974e-07, + "logits/chosen": 0.6653910875320435, + "logits/rejected": 0.986367404460907, + "logps/chosen": -549.1756591796875, + "logps/rejected": -682.2640991210938, + "loss": 0.5015, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.378861665725708, + "rewards/margins": 1.2142317295074463, + "rewards/rejected": -4.593092918395996, + "step": 6930 + }, + { + "epoch": 0.91, + "grad_norm": 38.5, + "learning_rate": 1.2745612670004153e-07, + "logits/chosen": 0.09889905154705048, + "logits/rejected": 1.0996273756027222, + "logps/chosen": -607.4309692382812, + "logps/rejected": -675.5977172851562, + "loss": 0.5235, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.2739291191101074, + "rewards/margins": 1.0797994136810303, + "rewards/rejected": -4.353728294372559, + "step": 6940 + }, + { + "epoch": 0.91, + "grad_norm": 18.75, + "learning_rate": 1.2388084090584395e-07, + "logits/chosen": 0.2793113589286804, + "logits/rejected": 1.003348469734192, + "logps/chosen": -564.1325073242188, + "logps/rejected": -657.8829345703125, + "loss": 0.5175, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.3109850883483887, + "rewards/margins": 1.2358797788619995, + "rewards/rejected": -4.5468645095825195, + "step": 6950 + }, + { + "epoch": 0.91, + "grad_norm": 10.375, + "learning_rate": 1.2035514226272305e-07, + "logits/chosen": 0.45842522382736206, + "logits/rejected": 1.0828557014465332, + "logps/chosen": -558.7763061523438, + "logps/rejected": -696.6474609375, + "loss": 0.4225, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.125001907348633, + "rewards/margins": 1.363785743713379, + "rewards/rejected": -4.488787651062012, + "step": 6960 + }, + { + "epoch": 0.91, + "grad_norm": 12.75, + "learning_rate": 1.1687910434836607e-07, + "logits/chosen": 0.1126895397901535, + "logits/rejected": 1.165924310684204, + "logps/chosen": -629.3409423828125, + "logps/rejected": -714.7694091796875, + "loss": 0.446, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.295433759689331, + "rewards/margins": 1.2763313055038452, + "rewards/rejected": -4.571765422821045, + "step": 6970 + }, + { + "epoch": 0.91, + "grad_norm": 14.1875, + "learning_rate": 1.1345279970409128e-07, + "logits/chosen": 0.44308653473854065, + "logits/rejected": 0.8887012600898743, + "logps/chosen": -580.622802734375, + "logps/rejected": -748.8364868164062, + "loss": 0.4164, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9695210456848145, + "rewards/margins": 1.5574761629104614, + "rewards/rejected": -4.526997089385986, + "step": 6980 + }, + { + "epoch": 0.91, + "grad_norm": 13.9375, + "learning_rate": 1.1007629983333629e-07, + "logits/chosen": 0.2842895984649658, + "logits/rejected": 0.597855806350708, + "logps/chosen": -570.9032592773438, + "logps/rejected": -714.6888427734375, + "loss": 0.5177, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.316150665283203, + "rewards/margins": 1.3857309818267822, + "rewards/rejected": -4.7018818855285645, + "step": 6990 + }, + { + "epoch": 0.92, + "grad_norm": 10.6875, + "learning_rate": 1.067496752001626e-07, + "logits/chosen": 0.09682926535606384, + "logits/rejected": 0.9496966600418091, + "logps/chosen": -586.10888671875, + "logps/rejected": -670.243896484375, + "loss": 0.55, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.264702558517456, + "rewards/margins": 1.1133906841278076, + "rewards/rejected": -4.378093242645264, + "step": 7000 + }, + { + "epoch": 0.92, + "eval_logits/chosen": 1.0131945610046387, + "eval_logits/rejected": 1.7566964626312256, + "eval_logps/chosen": -594.5901489257812, + "eval_logps/rejected": -705.9526977539062, + "eval_loss": 0.4888876676559448, + "eval_rewards/accuracies": 0.7505000233650208, + "eval_rewards/chosen": -3.2996912002563477, + "eval_rewards/margins": 1.3140465021133423, + "eval_rewards/rejected": -4.613737106323242, + "eval_runtime": 1591.6581, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 7000 + }, + { + "epoch": 0.92, + "grad_norm": 20.75, + "learning_rate": 1.0347299522778909e-07, + "logits/chosen": 0.4687480032444, + "logits/rejected": 1.1106703281402588, + "logps/chosen": -580.3803100585938, + "logps/rejected": -705.0828247070312, + "loss": 0.5507, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.449733018875122, + "rewards/margins": 1.257815957069397, + "rewards/rejected": -4.707549095153809, + "step": 7010 + }, + { + "epoch": 0.92, + "grad_norm": 31.5, + "learning_rate": 1.0024632829713971e-07, + "logits/chosen": 0.4061342179775238, + "logits/rejected": 1.314821481704712, + "logps/chosen": -599.4215087890625, + "logps/rejected": -715.9715576171875, + "loss": 0.4931, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.3331425189971924, + "rewards/margins": 1.471777319908142, + "rewards/rejected": -4.804919242858887, + "step": 7020 + }, + { + "epoch": 0.92, + "grad_norm": 7.75, + "learning_rate": 9.706974174541889e-08, + "logits/chosen": 0.375336229801178, + "logits/rejected": 0.8008115887641907, + "logps/chosen": -570.6932983398438, + "logps/rejected": -681.4544677734375, + "loss": 0.476, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.402949571609497, + "rewards/margins": 1.1330082416534424, + "rewards/rejected": -4.5359578132629395, + "step": 7030 + }, + { + "epoch": 0.92, + "grad_norm": 21.125, + "learning_rate": 9.39433018647043e-08, + "logits/chosen": 0.37960708141326904, + "logits/rejected": 1.2306239604949951, + "logps/chosen": -596.1632080078125, + "logps/rejected": -692.4556884765625, + "loss": 0.4308, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.3076729774475098, + "rewards/margins": 1.342871904373169, + "rewards/rejected": -4.6505446434021, + "step": 7040 + }, + { + "epoch": 0.92, + "grad_norm": 7.03125, + "learning_rate": 9.086707390056543e-08, + "logits/chosen": 0.22423484921455383, + "logits/rejected": 0.9336155652999878, + "logps/chosen": -581.1544799804688, + "logps/rejected": -696.49609375, + "loss": 0.4646, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.2052035331726074, + "rewards/margins": 1.3599494695663452, + "rewards/rejected": -4.565153121948242, + "step": 7050 + }, + { + "epoch": 0.92, + "grad_norm": 10.375, + "learning_rate": 8.784112205070083e-08, + "logits/chosen": 0.09238873422145844, + "logits/rejected": 1.2443900108337402, + "logps/chosen": -570.7318115234375, + "logps/rejected": -729.5119018554688, + "loss": 0.3718, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.9837429523468018, + "rewards/margins": 1.624291181564331, + "rewards/rejected": -4.608033657073975, + "step": 7060 + }, + { + "epoch": 0.93, + "grad_norm": 18.5, + "learning_rate": 8.486550946359779e-08, + "logits/chosen": 0.5402406454086304, + "logits/rejected": 0.9340842962265015, + "logps/chosen": -619.3458251953125, + "logps/rejected": -691.1231689453125, + "loss": 0.4756, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.4807777404785156, + "rewards/margins": 1.2306329011917114, + "rewards/rejected": -4.7114105224609375, + "step": 7070 + }, + { + "epoch": 0.93, + "grad_norm": 22.875, + "learning_rate": 8.194029823721556e-08, + "logits/chosen": 0.4674089848995209, + "logits/rejected": 1.4463164806365967, + "logps/chosen": -595.6998901367188, + "logps/rejected": -724.3187255859375, + "loss": 0.5678, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.4917690753936768, + "rewards/margins": 1.3110939264297485, + "rewards/rejected": -4.802863121032715, + "step": 7080 + }, + { + "epoch": 0.93, + "grad_norm": 17.0, + "learning_rate": 7.906554941768896e-08, + "logits/chosen": 0.3445693254470825, + "logits/rejected": 0.8292980194091797, + "logps/chosen": -568.9578857421875, + "logps/rejected": -723.1265869140625, + "loss": 0.5519, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.342053174972534, + "rewards/margins": 1.3594369888305664, + "rewards/rejected": -4.7014899253845215, + "step": 7090 + }, + { + "epoch": 0.93, + "grad_norm": 19.375, + "learning_rate": 7.624132299805575e-08, + "logits/chosen": 0.2580556273460388, + "logits/rejected": 1.3959585428237915, + "logps/chosen": -638.3900146484375, + "logps/rejected": -735.552978515625, + "loss": 0.4123, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.3954243659973145, + "rewards/margins": 1.474463701248169, + "rewards/rejected": -4.869887828826904, + "step": 7100 + }, + { + "epoch": 0.93, + "eval_logits/chosen": 1.0150725841522217, + "eval_logits/rejected": 1.7585645914077759, + "eval_logps/chosen": -594.8818969726562, + "eval_logps/rejected": -706.2577514648438, + "eval_loss": 0.4888802468776703, + "eval_rewards/accuracies": 0.7515000104904175, + "eval_rewards/chosen": -3.302608013153076, + "eval_rewards/margins": 1.3141798973083496, + "eval_rewards/rejected": -4.616787910461426, + "eval_runtime": 1591.5021, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 7100 + }, + { + "epoch": 0.93, + "grad_norm": 12.375, + "learning_rate": 7.346767791700127e-08, + "logits/chosen": 0.40067845582962036, + "logits/rejected": 0.7876986861228943, + "logps/chosen": -565.1066284179688, + "logps/rejected": -656.0726318359375, + "loss": 0.6681, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.3934097290039062, + "rewards/margins": 0.9237259030342102, + "rewards/rejected": -4.317135810852051, + "step": 7110 + }, + { + "epoch": 0.93, + "grad_norm": 5.75, + "learning_rate": 7.07446720576327e-08, + "logits/chosen": 0.2647624611854553, + "logits/rejected": 0.5136129260063171, + "logps/chosen": -589.279296875, + "logps/rejected": -711.5758666992188, + "loss": 0.5057, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.370063066482544, + "rewards/margins": 1.0963002443313599, + "rewards/rejected": -4.466362953186035, + "step": 7120 + }, + { + "epoch": 0.93, + "grad_norm": 9.0625, + "learning_rate": 6.807236224626701e-08, + "logits/chosen": 0.5471646189689636, + "logits/rejected": 0.7506142854690552, + "logps/chosen": -575.2327880859375, + "logps/rejected": -703.8402099609375, + "loss": 0.4122, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.049259901046753, + "rewards/margins": 1.5313215255737305, + "rewards/rejected": -4.5805816650390625, + "step": 7130 + }, + { + "epoch": 0.93, + "grad_norm": 5.90625, + "learning_rate": 6.545080425124888e-08, + "logits/chosen": 0.6009904146194458, + "logits/rejected": 1.5046106576919556, + "logps/chosen": -560.8460693359375, + "logps/rejected": -698.7578735351562, + "loss": 0.4021, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.1606037616729736, + "rewards/margins": 1.6776975393295288, + "rewards/rejected": -4.838300704956055, + "step": 7140 + }, + { + "epoch": 0.94, + "grad_norm": 7.0, + "learning_rate": 6.288005278178382e-08, + "logits/chosen": 0.10890443623065948, + "logits/rejected": 1.0314215421676636, + "logps/chosen": -591.5215454101562, + "logps/rejected": -700.6073608398438, + "loss": 0.4078, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.2524211406707764, + "rewards/margins": 1.4063465595245361, + "rewards/rejected": -4.658768177032471, + "step": 7150 + }, + { + "epoch": 0.94, + "grad_norm": 14.6875, + "learning_rate": 6.036016148679825e-08, + "logits/chosen": 0.10704119503498077, + "logits/rejected": 1.2058827877044678, + "logps/chosen": -549.1723022460938, + "logps/rejected": -699.8775634765625, + "loss": 0.3788, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.9703383445739746, + "rewards/margins": 1.7024837732315063, + "rewards/rejected": -4.672821998596191, + "step": 7160 + }, + { + "epoch": 0.94, + "grad_norm": 13.25, + "learning_rate": 5.7891182953819235e-08, + "logits/chosen": 0.6250897645950317, + "logits/rejected": 0.94916170835495, + "logps/chosen": -609.9509887695312, + "logps/rejected": -693.9453125, + "loss": 0.5902, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.5461039543151855, + "rewards/margins": 1.0024358034133911, + "rewards/rejected": -4.548539638519287, + "step": 7170 + }, + { + "epoch": 0.94, + "grad_norm": 14.9375, + "learning_rate": 5.547316870787689e-08, + "logits/chosen": 0.0662744864821434, + "logits/rejected": 1.2459442615509033, + "logps/chosen": -588.7398681640625, + "logps/rejected": -661.6343994140625, + "loss": 0.5033, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.239532947540283, + "rewards/margins": 1.3408972024917603, + "rewards/rejected": -4.580430507659912, + "step": 7180 + }, + { + "epoch": 0.94, + "grad_norm": 13.625, + "learning_rate": 5.310616921042927e-08, + "logits/chosen": 0.13475020229816437, + "logits/rejected": 1.097031593322754, + "logps/chosen": -666.7498779296875, + "logps/rejected": -723.42236328125, + "loss": 0.4635, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3409552574157715, + "rewards/margins": 1.3054221868515015, + "rewards/rejected": -4.6463775634765625, + "step": 7190 + }, + { + "epoch": 0.94, + "grad_norm": 21.25, + "learning_rate": 5.079023385830939e-08, + "logits/chosen": 0.4373611509799957, + "logits/rejected": 1.603753685951233, + "logps/chosen": -611.4661865234375, + "logps/rejected": -711.8818359375, + "loss": 0.5207, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.7718963623046875, + "rewards/margins": 1.09917414188385, + "rewards/rejected": -4.871070384979248, + "step": 7200 + }, + { + "epoch": 0.94, + "eval_logits/chosen": 1.01255202293396, + "eval_logits/rejected": 1.7557493448257446, + "eval_logps/chosen": -595.11279296875, + "eval_logps/rejected": -706.500732421875, + "eval_loss": 0.48874008655548096, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -3.304917097091675, + "eval_rewards/margins": 1.3143013715744019, + "eval_rewards/rejected": -4.619218826293945, + "eval_runtime": 1591.398, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 7200 + }, + { + "epoch": 0.94, + "grad_norm": 22.125, + "learning_rate": 4.8525410982695476e-08, + "logits/chosen": 0.41507038474082947, + "logits/rejected": 1.2446469068527222, + "logps/chosen": -608.064453125, + "logps/rejected": -712.0289306640625, + "loss": 0.5465, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.3873703479766846, + "rewards/margins": 1.2731060981750488, + "rewards/rejected": -4.6604766845703125, + "step": 7210 + }, + { + "epoch": 0.94, + "grad_norm": 7.0, + "learning_rate": 4.6311747848099e-08, + "logits/chosen": -0.24930362403392792, + "logits/rejected": 1.102052927017212, + "logps/chosen": -652.6702270507812, + "logps/rejected": -732.0322875976562, + "loss": 0.5089, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.4032764434814453, + "rewards/margins": 1.2973884344100952, + "rewards/rejected": -4.700665473937988, + "step": 7220 + }, + { + "epoch": 0.95, + "grad_norm": 18.875, + "learning_rate": 4.4149290651382405e-08, + "logits/chosen": 0.3960798382759094, + "logits/rejected": 1.1941249370574951, + "logps/chosen": -602.910888671875, + "logps/rejected": -672.2167358398438, + "loss": 0.5062, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.242851972579956, + "rewards/margins": 1.2035329341888428, + "rewards/rejected": -4.446384429931641, + "step": 7230 + }, + { + "epoch": 0.95, + "grad_norm": 31.25, + "learning_rate": 4.203808452079211e-08, + "logits/chosen": 0.12929697334766388, + "logits/rejected": 0.7588415741920471, + "logps/chosen": -581.9747314453125, + "logps/rejected": -684.8924560546875, + "loss": 0.5113, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.2949206829071045, + "rewards/margins": 1.1291944980621338, + "rewards/rejected": -4.424115180969238, + "step": 7240 + }, + { + "epoch": 0.95, + "grad_norm": 7.34375, + "learning_rate": 3.9978173515018427e-08, + "logits/chosen": 0.22393746674060822, + "logits/rejected": 0.6054302453994751, + "logps/chosen": -650.0538330078125, + "logps/rejected": -768.4112548828125, + "loss": 0.537, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.4565773010253906, + "rewards/margins": 1.2421902418136597, + "rewards/rejected": -4.69876766204834, + "step": 7250 + }, + { + "epoch": 0.95, + "grad_norm": 9.75, + "learning_rate": 3.7969600622274614e-08, + "logits/chosen": 0.7730494141578674, + "logits/rejected": 0.8779703378677368, + "logps/chosen": -616.2741088867188, + "logps/rejected": -757.5372314453125, + "loss": 0.5026, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.5739378929138184, + "rewards/margins": 1.1644906997680664, + "rewards/rejected": -4.738428592681885, + "step": 7260 + }, + { + "epoch": 0.95, + "grad_norm": 11.875, + "learning_rate": 3.601240775940151e-08, + "logits/chosen": 0.0924004465341568, + "logits/rejected": 0.7291241884231567, + "logps/chosen": -585.1265869140625, + "logps/rejected": -715.2276611328125, + "loss": 0.442, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.0802805423736572, + "rewards/margins": 1.3805367946624756, + "rewards/rejected": -4.460817337036133, + "step": 7270 + }, + { + "epoch": 0.95, + "grad_norm": 9.875, + "learning_rate": 3.410663577099071e-08, + "logits/chosen": 0.8634665608406067, + "logits/rejected": 1.24403715133667, + "logps/chosen": -625.5086059570312, + "logps/rejected": -712.8633422851562, + "loss": 0.6482, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.5116772651672363, + "rewards/margins": 1.0819501876831055, + "rewards/rejected": -4.593627452850342, + "step": 7280 + }, + { + "epoch": 0.95, + "grad_norm": 11.9375, + "learning_rate": 3.2252324428534986e-08, + "logits/chosen": 0.0949554294347763, + "logits/rejected": 1.7730538845062256, + "logps/chosen": -645.7748413085938, + "logps/rejected": -777.1041259765625, + "loss": 0.4368, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.345384120941162, + "rewards/margins": 1.7858293056488037, + "rewards/rejected": -5.131213188171387, + "step": 7290 + }, + { + "epoch": 0.96, + "grad_norm": 12.0625, + "learning_rate": 3.0449512429594486e-08, + "logits/chosen": 0.6525617837905884, + "logits/rejected": 1.2398340702056885, + "logps/chosen": -587.5560302734375, + "logps/rejected": -742.075927734375, + "loss": 0.4618, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.530935764312744, + "rewards/margins": 1.445012092590332, + "rewards/rejected": -4.975947856903076, + "step": 7300 + }, + { + "epoch": 0.96, + "eval_logits/chosen": 1.0115752220153809, + "eval_logits/rejected": 1.7552307844161987, + "eval_logps/chosen": -594.8143310546875, + "eval_logps/rejected": -706.2247314453125, + "eval_loss": 0.48876285552978516, + "eval_rewards/accuracies": 0.7515000104904175, + "eval_rewards/chosen": -3.3019332885742188, + "eval_rewards/margins": 1.3145242929458618, + "eval_rewards/rejected": -4.616457462310791, + "eval_runtime": 1590.7324, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 7300 + }, + { + "epoch": 0.96, + "grad_norm": 7.6875, + "learning_rate": 2.8698237396992956e-08, + "logits/chosen": 0.09208633005619049, + "logits/rejected": 1.0006811618804932, + "logps/chosen": -608.3304443359375, + "logps/rejected": -736.591796875, + "loss": 0.4175, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.210888624191284, + "rewards/margins": 1.6330093145370483, + "rewards/rejected": -4.843898296356201, + "step": 7310 + }, + { + "epoch": 0.96, + "grad_norm": 11.5625, + "learning_rate": 2.6998535878030584e-08, + "logits/chosen": 0.43088826537132263, + "logits/rejected": 1.5745205879211426, + "logps/chosen": -615.910400390625, + "logps/rejected": -689.7060546875, + "loss": 0.5256, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.2760322093963623, + "rewards/margins": 1.1446540355682373, + "rewards/rejected": -4.4206862449646, + "step": 7320 + }, + { + "epoch": 0.96, + "grad_norm": 10.25, + "learning_rate": 2.535044334372072e-08, + "logits/chosen": 0.48195523023605347, + "logits/rejected": 1.243807077407837, + "logps/chosen": -566.0392456054688, + "logps/rejected": -699.8907470703125, + "loss": 0.5006, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.182238817214966, + "rewards/margins": 1.3754639625549316, + "rewards/rejected": -4.557702541351318, + "step": 7330 + }, + { + "epoch": 0.96, + "grad_norm": 7.84375, + "learning_rate": 2.3753994188051853e-08, + "logits/chosen": -0.08187036216259003, + "logits/rejected": 0.7814757227897644, + "logps/chosen": -654.4878540039062, + "logps/rejected": -722.797119140625, + "loss": 0.6864, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.5121543407440186, + "rewards/margins": 0.8491532206535339, + "rewards/rejected": -4.361307621002197, + "step": 7340 + }, + { + "epoch": 0.96, + "grad_norm": 26.125, + "learning_rate": 2.220922172726764e-08, + "logits/chosen": 0.5146197080612183, + "logits/rejected": 1.2585347890853882, + "logps/chosen": -602.0792236328125, + "logps/rejected": -677.6658935546875, + "loss": 0.6084, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.289280414581299, + "rewards/margins": 1.0045863389968872, + "rewards/rejected": -4.2938666343688965, + "step": 7350 + }, + { + "epoch": 0.96, + "grad_norm": 23.75, + "learning_rate": 2.071615819917244e-08, + "logits/chosen": 0.22979173064231873, + "logits/rejected": 1.6183487176895142, + "logps/chosen": -617.9466552734375, + "logps/rejected": -690.7569580078125, + "loss": 0.3736, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.1348488330841064, + "rewards/margins": 1.4200165271759033, + "rewards/rejected": -4.55486536026001, + "step": 7360 + }, + { + "epoch": 0.96, + "grad_norm": 10.25, + "learning_rate": 1.9274834762459393e-08, + "logits/chosen": 0.6498265266418457, + "logits/rejected": 1.3851298093795776, + "logps/chosen": -553.2167358398438, + "logps/rejected": -704.5006103515625, + "loss": 0.4863, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.3176722526550293, + "rewards/margins": 1.5971444845199585, + "rewards/rejected": -4.914816856384277, + "step": 7370 + }, + { + "epoch": 0.97, + "grad_norm": 6.75, + "learning_rate": 1.7885281496058947e-08, + "logits/chosen": 0.7782396674156189, + "logits/rejected": 1.534798264503479, + "logps/chosen": -580.90478515625, + "logps/rejected": -677.3753662109375, + "loss": 0.5651, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2415664196014404, + "rewards/margins": 1.3312580585479736, + "rewards/rejected": -4.572824001312256, + "step": 7380 + }, + { + "epoch": 0.97, + "grad_norm": 28.125, + "learning_rate": 1.654752739851134e-08, + "logits/chosen": 0.4140414297580719, + "logits/rejected": 1.5184310674667358, + "logps/chosen": -642.1239013671875, + "logps/rejected": -669.8280029296875, + "loss": 0.537, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.256727695465088, + "rewards/margins": 1.1267528533935547, + "rewards/rejected": -4.383481025695801, + "step": 7390 + }, + { + "epoch": 0.97, + "grad_norm": 20.0, + "learning_rate": 1.526160038736235e-08, + "logits/chosen": 0.35390472412109375, + "logits/rejected": 1.5663673877716064, + "logps/chosen": -628.6697998046875, + "logps/rejected": -712.1144409179688, + "loss": 0.4826, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.2142441272735596, + "rewards/margins": 1.3702830076217651, + "rewards/rejected": -4.584527015686035, + "step": 7400 + }, + { + "epoch": 0.97, + "eval_logits/chosen": 1.0107791423797607, + "eval_logits/rejected": 1.7537970542907715, + "eval_logps/chosen": -594.97314453125, + "eval_logps/rejected": -706.3511962890625, + "eval_loss": 0.48887208104133606, + "eval_rewards/accuracies": 0.7509999871253967, + "eval_rewards/chosen": -3.303520679473877, + "eval_rewards/margins": 1.3142021894454956, + "eval_rewards/rejected": -4.617722988128662, + "eval_runtime": 1591.0726, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 7400 + }, + { + "epoch": 0.97, + "grad_norm": 31.125, + "learning_rate": 1.402752729857959e-08, + "logits/chosen": 0.4313858151435852, + "logits/rejected": 1.6850160360336304, + "logps/chosen": -596.0479125976562, + "logps/rejected": -672.6542358398438, + "loss": 0.5335, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.3667705059051514, + "rewards/margins": 1.3584734201431274, + "rewards/rejected": -4.725244045257568, + "step": 7410 + }, + { + "epoch": 0.97, + "grad_norm": 17.25, + "learning_rate": 1.2845333885992683e-08, + "logits/chosen": 0.5016809701919556, + "logits/rejected": 1.1508241891860962, + "logps/chosen": -554.3850708007812, + "logps/rejected": -715.5885009765625, + "loss": 0.4485, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.1848483085632324, + "rewards/margins": 1.6087548732757568, + "rewards/rejected": -4.79360294342041, + "step": 7420 + }, + { + "epoch": 0.97, + "grad_norm": 10.0, + "learning_rate": 1.171504482075675e-08, + "logits/chosen": 0.2105075567960739, + "logits/rejected": 1.2978436946868896, + "logps/chosen": -595.2945556640625, + "logps/rejected": -743.7322998046875, + "loss": 0.3998, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.3599941730499268, + "rewards/margins": 1.502739667892456, + "rewards/rejected": -4.862734317779541, + "step": 7430 + }, + { + "epoch": 0.97, + "grad_norm": 20.625, + "learning_rate": 1.0636683690836147e-08, + "logits/chosen": 0.3214600086212158, + "logits/rejected": 1.136541485786438, + "logps/chosen": -607.3497314453125, + "logps/rejected": -740.9457397460938, + "loss": 0.4688, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.1903975009918213, + "rewards/margins": 1.4278573989868164, + "rewards/rejected": -4.618254661560059, + "step": 7440 + }, + { + "epoch": 0.97, + "grad_norm": 21.625, + "learning_rate": 9.610273000513203e-09, + "logits/chosen": 0.7849928140640259, + "logits/rejected": 1.3118616342544556, + "logps/chosen": -529.127685546875, + "logps/rejected": -659.9454345703125, + "loss": 0.4025, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1130189895629883, + "rewards/margins": 1.4227343797683716, + "rewards/rejected": -4.5357537269592285, + "step": 7450 + }, + { + "epoch": 0.98, + "grad_norm": 12.75, + "learning_rate": 8.635834169918312e-09, + "logits/chosen": -0.15050581097602844, + "logits/rejected": 0.86566561460495, + "logps/chosen": -588.2030029296875, + "logps/rejected": -701.0858764648438, + "loss": 0.3991, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.156752109527588, + "rewards/margins": 1.3994879722595215, + "rewards/rejected": -4.556240081787109, + "step": 7460 + }, + { + "epoch": 0.98, + "grad_norm": 14.0, + "learning_rate": 7.713387534582506e-09, + "logits/chosen": 0.42161521315574646, + "logits/rejected": 0.6920258402824402, + "logps/chosen": -543.2078857421875, + "logps/rejected": -679.2174072265625, + "loss": 0.566, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.1884536743164062, + "rewards/margins": 1.1040431261062622, + "rewards/rejected": -4.292496681213379, + "step": 7470 + }, + { + "epoch": 0.98, + "grad_norm": 16.25, + "learning_rate": 6.84295234501392e-09, + "logits/chosen": 0.3836548924446106, + "logits/rejected": 0.7305153012275696, + "logps/chosen": -597.7833251953125, + "logps/rejected": -677.3738403320312, + "loss": 0.5487, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.133619785308838, + "rewards/margins": 1.116659164428711, + "rewards/rejected": -4.250278949737549, + "step": 7480 + }, + { + "epoch": 0.98, + "grad_norm": 23.0, + "learning_rate": 6.024546766295325e-09, + "logits/chosen": 0.15691399574279785, + "logits/rejected": 1.376615047454834, + "logps/chosen": -587.3973999023438, + "logps/rejected": -657.0569458007812, + "loss": 0.5322, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.09767746925354, + "rewards/margins": 1.212154507637024, + "rewards/rejected": -4.309831619262695, + "step": 7490 + }, + { + "epoch": 0.98, + "grad_norm": 9.3125, + "learning_rate": 5.2581878777049895e-09, + "logits/chosen": 0.4897291660308838, + "logits/rejected": 1.0999479293823242, + "logps/chosen": -582.9024658203125, + "logps/rejected": -714.1553344726562, + "loss": 0.3856, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.102313995361328, + "rewards/margins": 1.7228889465332031, + "rewards/rejected": -4.825202941894531, + "step": 7500 + }, + { + "epoch": 0.98, + "eval_logits/chosen": 1.011379361152649, + "eval_logits/rejected": 1.7544424533843994, + "eval_logps/chosen": -595.0473022460938, + "eval_logps/rejected": -706.4486083984375, + "eval_loss": 0.48872044682502747, + "eval_rewards/accuracies": 0.7515000104904175, + "eval_rewards/chosen": -3.30426287651062, + "eval_rewards/margins": 1.3144340515136719, + "eval_rewards/rejected": -4.618696689605713, + "eval_runtime": 1590.4964, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 7500 + }, + { + "epoch": 0.98, + "grad_norm": 13.3125, + "learning_rate": 4.543891672361411e-09, + "logits/chosen": -0.033980078995227814, + "logits/rejected": 1.000163197517395, + "logps/chosen": -629.8569946289062, + "logps/rejected": -730.5193481445312, + "loss": 0.4221, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1672351360321045, + "rewards/margins": 1.4683955907821655, + "rewards/rejected": -4.635631084442139, + "step": 7510 + }, + { + "epoch": 0.98, + "grad_norm": 10.25, + "learning_rate": 3.881673056887747e-09, + "logits/chosen": 0.13274362683296204, + "logits/rejected": 1.4995901584625244, + "logps/chosen": -599.1785888671875, + "logps/rejected": -693.8605346679688, + "loss": 0.428, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.077873945236206, + "rewards/margins": 1.6000537872314453, + "rewards/rejected": -4.677927494049072, + "step": 7520 + }, + { + "epoch": 0.99, + "grad_norm": 23.5, + "learning_rate": 3.2715458511023425e-09, + "logits/chosen": 0.3830450475215912, + "logits/rejected": 0.914400577545166, + "logps/chosen": -604.7692260742188, + "logps/rejected": -697.35205078125, + "loss": 0.587, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.5337085723876953, + "rewards/margins": 0.94794762134552, + "rewards/rejected": -4.481656074523926, + "step": 7530 + }, + { + "epoch": 0.99, + "grad_norm": 13.6875, + "learning_rate": 2.7135227877289617e-09, + "logits/chosen": 0.5699089765548706, + "logits/rejected": 0.8099034428596497, + "logps/chosen": -581.1547241210938, + "logps/rejected": -740.4284057617188, + "loss": 0.45, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.312006711959839, + "rewards/margins": 1.4148095846176147, + "rewards/rejected": -4.726816654205322, + "step": 7540 + }, + { + "epoch": 0.99, + "grad_norm": 21.875, + "learning_rate": 2.2076155121328326e-09, + "logits/chosen": 0.5921443700790405, + "logits/rejected": 1.261713981628418, + "logps/chosen": -579.9949951171875, + "logps/rejected": -733.7827758789062, + "loss": 0.3957, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.289013385772705, + "rewards/margins": 1.5766642093658447, + "rewards/rejected": -4.865677833557129, + "step": 7550 + }, + { + "epoch": 0.99, + "grad_norm": 16.125, + "learning_rate": 1.7538345820755641e-09, + "logits/chosen": 0.4986654222011566, + "logits/rejected": 1.1568987369537354, + "logps/chosen": -579.21533203125, + "logps/rejected": -667.2409057617188, + "loss": 0.5121, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.2680678367614746, + "rewards/margins": 1.1933481693267822, + "rewards/rejected": -4.461415767669678, + "step": 7560 + }, + { + "epoch": 0.99, + "grad_norm": 14.5625, + "learning_rate": 1.3521894674961567e-09, + "logits/chosen": 0.21069379150867462, + "logits/rejected": 1.3896992206573486, + "logps/chosen": -589.5611572265625, + "logps/rejected": -748.1316528320312, + "loss": 0.4201, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2263946533203125, + "rewards/margins": 1.7248976230621338, + "rewards/rejected": -4.951291561126709, + "step": 7570 + }, + { + "epoch": 0.99, + "grad_norm": 14.75, + "learning_rate": 1.0026885503131023e-09, + "logits/chosen": 0.10079088062047958, + "logits/rejected": 1.2259687185287476, + "logps/chosen": -587.1923828125, + "logps/rejected": -704.1217041015625, + "loss": 0.4449, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.298996686935425, + "rewards/margins": 1.4512828588485718, + "rewards/rejected": -4.750279426574707, + "step": 7580 + }, + { + "epoch": 0.99, + "grad_norm": 16.375, + "learning_rate": 7.053391242492491e-10, + "logits/chosen": 0.2513376772403717, + "logits/rejected": 0.9676704406738281, + "logps/chosen": -625.3098754882812, + "logps/rejected": -719.0193481445312, + "loss": 0.5372, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.4935874938964844, + "rewards/margins": 1.0755943059921265, + "rewards/rejected": -4.5691819190979, + "step": 7590 + }, + { + "epoch": 0.99, + "grad_norm": 8.4375, + "learning_rate": 4.6014739467997725e-10, + "logits/chosen": 0.40907493233680725, + "logits/rejected": 0.9747940301895142, + "logps/chosen": -537.2282104492188, + "logps/rejected": -644.7360229492188, + "loss": 0.5369, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.1004512310028076, + "rewards/margins": 1.0329300165176392, + "rewards/rejected": -4.133380889892578, + "step": 7600 + }, + { + "epoch": 0.99, + "eval_logits/chosen": 1.0126349925994873, + "eval_logits/rejected": 1.755886197090149, + "eval_logps/chosen": -594.9012451171875, + "eval_logps/rejected": -706.3289794921875, + "eval_loss": 0.4886167645454407, + "eval_rewards/accuracies": 0.7519999742507935, + "eval_rewards/chosen": -3.302802324295044, + "eval_rewards/margins": 1.3146986961364746, + "eval_rewards/rejected": -4.6175007820129395, + "eval_runtime": 1590.8922, + "eval_samples_per_second": 1.257, + "eval_steps_per_second": 0.314, + "step": 7600 + }, + { + "epoch": 1.0, + "grad_norm": 14.375, + "learning_rate": 2.671184785033032e-10, + "logits/chosen": 0.04215417057275772, + "logits/rejected": 1.0146968364715576, + "logps/chosen": -626.7977905273438, + "logps/rejected": -724.8660278320312, + "loss": 0.4751, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.042105197906494, + "rewards/margins": 1.3117393255233765, + "rewards/rejected": -4.353843688964844, + "step": 7610 + }, + { + "epoch": 1.0, + "grad_norm": 16.375, + "learning_rate": 1.2625640403302054e-10, + "logits/chosen": 0.33571863174438477, + "logits/rejected": 0.9231777191162109, + "logps/chosen": -619.7828369140625, + "logps/rejected": -719.2232666015625, + "loss": 0.5526, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.4431312084198, + "rewards/margins": 1.2789758443832397, + "rewards/rejected": -4.72210693359375, + "step": 7620 + }, + { + "epoch": 1.0, + "grad_norm": 8.625, + "learning_rate": 3.756411091515588e-11, + "logits/chosen": 0.18591374158859253, + "logits/rejected": 1.280806303024292, + "logps/chosen": -558.523681640625, + "logps/rejected": -691.3430786132812, + "loss": 0.4201, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.0532612800598145, + "rewards/margins": 1.6573164463043213, + "rewards/rejected": -4.710577487945557, + "step": 7630 + }, + { + "epoch": 1.0, + "grad_norm": 7.5625, + "learning_rate": 1.0434500657963143e-12, + "logits/chosen": 0.10529494285583496, + "logits/rejected": 1.4016101360321045, + "logps/chosen": -626.654296875, + "logps/rejected": -703.9794921875, + "loss": 0.5513, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.382859706878662, + "rewards/margins": 1.3783118724822998, + "rewards/rejected": -4.761171817779541, + "step": 7640 + }, + { + "epoch": 1.0, + "step": 7642, + "total_flos": 0.0, + "train_loss": 0.5201432038994555, + "train_runtime": 240641.1375, + "train_samples_per_second": 0.254, + "train_steps_per_second": 0.032 + } + ], + "logging_steps": 10, + "max_steps": 7642, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}