{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 7642, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3.21875, "learning_rate": 6.535947712418301e-09, "logits/chosen": -2.2813315391540527, "logits/rejected": -2.01680850982666, "logps/chosen": -216.2415771484375, "logps/rejected": -121.72990417480469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 2.78125, "learning_rate": 6.535947712418302e-08, "logits/chosen": -2.4226467609405518, "logits/rejected": -2.36716365814209, "logps/chosen": -281.0023498535156, "logps/rejected": -206.8296661376953, "loss": 0.6932, "rewards/accuracies": 0.375, "rewards/chosen": 0.00011049283784814179, "rewards/margins": -0.00019701628480106592, "rewards/rejected": 0.00030750909354537725, "step": 10 }, { "epoch": 0.0, "grad_norm": 3.265625, "learning_rate": 1.3071895424836603e-07, "logits/chosen": -2.4675583839416504, "logits/rejected": -2.504241466522217, "logps/chosen": -238.65823364257812, "logps/rejected": -216.17611694335938, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00041228701593354344, "rewards/margins": 8.625028567621484e-05, "rewards/rejected": 0.00032603665022179484, "step": 20 }, { "epoch": 0.0, "grad_norm": 3.453125, "learning_rate": 1.9607843137254904e-07, "logits/chosen": -2.4760632514953613, "logits/rejected": -2.4448161125183105, "logps/chosen": -226.6892547607422, "logps/rejected": -225.68603515625, "loss": 0.6929, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0022668808232992887, "rewards/margins": 0.0005866141873411834, "rewards/rejected": 0.0016802664613351226, "step": 30 }, { "epoch": 0.01, "grad_norm": 3.203125, "learning_rate": 2.6143790849673207e-07, "logits/chosen": -2.5491480827331543, "logits/rejected": -2.4344499111175537, "logps/chosen": -260.48663330078125, "logps/rejected": -297.4029235839844, "loss": 0.6923, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.007132339291274548, "rewards/margins": 0.0017693456029519439, "rewards/rejected": 0.005362994037568569, "step": 40 }, { "epoch": 0.01, "grad_norm": 3.5625, "learning_rate": 3.267973856209151e-07, "logits/chosen": -2.5332558155059814, "logits/rejected": -2.395725965499878, "logps/chosen": -268.88641357421875, "logps/rejected": -231.83193969726562, "loss": 0.6921, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.010614831931889057, "rewards/margins": 0.002073808806017041, "rewards/rejected": 0.00854102335870266, "step": 50 }, { "epoch": 0.01, "grad_norm": 3.328125, "learning_rate": 3.921568627450981e-07, "logits/chosen": -2.486255168914795, "logits/rejected": -2.4650115966796875, "logps/chosen": -304.43609619140625, "logps/rejected": -281.531005859375, "loss": 0.692, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0181833915412426, "rewards/margins": 0.0023597306571900845, "rewards/rejected": 0.015823662281036377, "step": 60 }, { "epoch": 0.01, "grad_norm": 2.453125, "learning_rate": 4.5751633986928105e-07, "logits/chosen": -2.468477249145508, "logits/rejected": -2.358503818511963, "logps/chosen": -274.7584228515625, "logps/rejected": -262.47967529296875, "loss": 0.6893, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.023093093186616898, "rewards/margins": 0.00769047299399972, "rewards/rejected": 0.015402620658278465, "step": 70 }, { "epoch": 0.01, "grad_norm": 3.578125, "learning_rate": 5.228758169934641e-07, "logits/chosen": -2.4944674968719482, "logits/rejected": -2.388322353363037, "logps/chosen": -281.0355529785156, "logps/rejected": -250.0665283203125, "loss": 0.6891, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.030989915132522583, "rewards/margins": 0.00816916860640049, "rewards/rejected": 0.022820744663476944, "step": 80 }, { "epoch": 0.01, "grad_norm": 2.90625, "learning_rate": 5.882352941176471e-07, "logits/chosen": -2.54004168510437, "logits/rejected": -2.4396867752075195, "logps/chosen": -264.7198181152344, "logps/rejected": -234.16793823242188, "loss": 0.69, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.036226507276296616, "rewards/margins": 0.006393597926944494, "rewards/rejected": 0.029832908883690834, "step": 90 }, { "epoch": 0.01, "grad_norm": 3.421875, "learning_rate": 6.535947712418302e-07, "logits/chosen": -2.6216228008270264, "logits/rejected": -2.5309205055236816, "logps/chosen": -265.4188537597656, "logps/rejected": -224.04837036132812, "loss": 0.6885, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.04085296764969826, "rewards/margins": 0.009448667988181114, "rewards/rejected": 0.031404297798871994, "step": 100 }, { "epoch": 0.01, "eval_logits/chosen": -2.386389970779419, "eval_logits/rejected": -2.301281452178955, "eval_logps/chosen": -260.609619140625, "eval_logps/rejected": -241.47633361816406, "eval_loss": 0.6887126564979553, "eval_rewards/accuracies": 0.6154999732971191, "eval_rewards/chosen": 0.04011417552828789, "eval_rewards/margins": 0.009088212624192238, "eval_rewards/rejected": 0.031025957316160202, "eval_runtime": 1591.0686, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 100 }, { "epoch": 0.01, "grad_norm": 2.640625, "learning_rate": 7.189542483660131e-07, "logits/chosen": -2.4524688720703125, "logits/rejected": -2.3316445350646973, "logps/chosen": -242.79489135742188, "logps/rejected": -221.9463653564453, "loss": 0.6879, "rewards/accuracies": 0.5625, "rewards/chosen": 0.04074569046497345, "rewards/margins": 0.010795327834784985, "rewards/rejected": 0.02995036169886589, "step": 110 }, { "epoch": 0.02, "grad_norm": 3.65625, "learning_rate": 7.843137254901962e-07, "logits/chosen": -2.6706340312957764, "logits/rejected": -2.5337891578674316, "logps/chosen": -304.76055908203125, "logps/rejected": -252.02218627929688, "loss": 0.6863, "rewards/accuracies": 0.625, "rewards/chosen": 0.04983269050717354, "rewards/margins": 0.014050389640033245, "rewards/rejected": 0.03578229993581772, "step": 120 }, { "epoch": 0.02, "grad_norm": 3.875, "learning_rate": 8.496732026143792e-07, "logits/chosen": -2.52778959274292, "logits/rejected": -2.411165714263916, "logps/chosen": -310.9256286621094, "logps/rejected": -255.0240478515625, "loss": 0.6857, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.050248682498931885, "rewards/margins": 0.01527202595025301, "rewards/rejected": 0.03497665375471115, "step": 130 }, { "epoch": 0.02, "grad_norm": 2.828125, "learning_rate": 9.150326797385621e-07, "logits/chosen": -2.5127735137939453, "logits/rejected": -2.426474094390869, "logps/chosen": -238.1068878173828, "logps/rejected": -235.57666015625, "loss": 0.6877, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.04580515995621681, "rewards/margins": 0.011373082175850868, "rewards/rejected": 0.034432075917720795, "step": 140 }, { "epoch": 0.02, "grad_norm": 2.875, "learning_rate": 9.80392156862745e-07, "logits/chosen": -2.514488697052002, "logits/rejected": -2.4246912002563477, "logps/chosen": -253.14688110351562, "logps/rejected": -230.44039916992188, "loss": 0.6849, "rewards/accuracies": 0.625, "rewards/chosen": 0.04800596088171005, "rewards/margins": 0.0169829074293375, "rewards/rejected": 0.0310230515897274, "step": 150 }, { "epoch": 0.02, "grad_norm": 3.609375, "learning_rate": 1.0457516339869283e-06, "logits/chosen": -2.6019513607025146, "logits/rejected": -2.48175048828125, "logps/chosen": -250.6589813232422, "logps/rejected": -221.22116088867188, "loss": 0.6851, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.05127542465925217, "rewards/margins": 0.016976820304989815, "rewards/rejected": 0.0342986062169075, "step": 160 }, { "epoch": 0.02, "grad_norm": 2.796875, "learning_rate": 1.111111111111111e-06, "logits/chosen": -2.562800407409668, "logits/rejected": -2.4376182556152344, "logps/chosen": -264.79949951171875, "logps/rejected": -236.85531616210938, "loss": 0.683, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.05695644021034241, "rewards/margins": 0.02115512825548649, "rewards/rejected": 0.03580131381750107, "step": 170 }, { "epoch": 0.02, "grad_norm": 2.6875, "learning_rate": 1.1764705882352942e-06, "logits/chosen": -2.595853567123413, "logits/rejected": -2.445687770843506, "logps/chosen": -253.2369842529297, "logps/rejected": -239.16470336914062, "loss": 0.6793, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.05581844970583916, "rewards/margins": 0.029622327536344528, "rewards/rejected": 0.026196125894784927, "step": 180 }, { "epoch": 0.02, "grad_norm": 3.59375, "learning_rate": 1.2418300653594772e-06, "logits/chosen": -2.59649395942688, "logits/rejected": -2.5004138946533203, "logps/chosen": -260.62255859375, "logps/rejected": -233.1819610595703, "loss": 0.6771, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.05944965034723282, "rewards/margins": 0.03332919254899025, "rewards/rejected": 0.02612045407295227, "step": 190 }, { "epoch": 0.03, "grad_norm": 3.203125, "learning_rate": 1.3071895424836604e-06, "logits/chosen": -2.5189108848571777, "logits/rejected": -2.487696409225464, "logps/chosen": -228.7367401123047, "logps/rejected": -243.255859375, "loss": 0.6826, "rewards/accuracies": 0.625, "rewards/chosen": 0.04864143207669258, "rewards/margins": 0.02205492928624153, "rewards/rejected": 0.02658650279045105, "step": 200 }, { "epoch": 0.03, "eval_logits/chosen": -2.379215955734253, "eval_logits/rejected": -2.2938711643218994, "eval_logps/chosen": -259.2414855957031, "eval_logps/rejected": -242.49423217773438, "eval_loss": 0.6777375936508179, "eval_rewards/accuracies": 0.6554999947547913, "eval_rewards/chosen": 0.05379528924822807, "eval_rewards/margins": 0.03294837102293968, "eval_rewards/rejected": 0.02084691822528839, "eval_runtime": 1592.4448, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 200 }, { "epoch": 0.03, "grad_norm": 3.4375, "learning_rate": 1.3725490196078434e-06, "logits/chosen": -2.5041604042053223, "logits/rejected": -2.4234907627105713, "logps/chosen": -257.89825439453125, "logps/rejected": -264.8030090332031, "loss": 0.6747, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.049550481140613556, "rewards/margins": 0.038632601499557495, "rewards/rejected": 0.01091787964105606, "step": 210 }, { "epoch": 0.03, "grad_norm": 2.65625, "learning_rate": 1.4379084967320261e-06, "logits/chosen": -2.511364698410034, "logits/rejected": -2.390717029571533, "logps/chosen": -214.1625213623047, "logps/rejected": -208.02676391601562, "loss": 0.6712, "rewards/accuracies": 0.75, "rewards/chosen": 0.047859933227300644, "rewards/margins": 0.046445488929748535, "rewards/rejected": 0.0014144459273666143, "step": 220 }, { "epoch": 0.03, "grad_norm": 3.078125, "learning_rate": 1.5032679738562091e-06, "logits/chosen": -2.6636946201324463, "logits/rejected": -2.510026454925537, "logps/chosen": -330.6913146972656, "logps/rejected": -265.2884216308594, "loss": 0.6669, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.06970520317554474, "rewards/margins": 0.0569244809448719, "rewards/rejected": 0.012780720368027687, "step": 230 }, { "epoch": 0.03, "grad_norm": 3.875, "learning_rate": 1.5686274509803923e-06, "logits/chosen": -2.523871421813965, "logits/rejected": -2.399880886077881, "logps/chosen": -249.8741455078125, "logps/rejected": -195.24508666992188, "loss": 0.6542, "rewards/accuracies": 0.75, "rewards/chosen": 0.05200767517089844, "rewards/margins": 0.08412192761898041, "rewards/rejected": -0.03211425989866257, "step": 240 }, { "epoch": 0.03, "grad_norm": 3.546875, "learning_rate": 1.6339869281045753e-06, "logits/chosen": -2.488612174987793, "logits/rejected": -2.384747266769409, "logps/chosen": -273.90985107421875, "logps/rejected": -223.31674194335938, "loss": 0.6547, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0524408221244812, "rewards/margins": 0.08312702178955078, "rewards/rejected": -0.03068619966506958, "step": 250 }, { "epoch": 0.03, "grad_norm": 3.96875, "learning_rate": 1.6993464052287585e-06, "logits/chosen": -2.437614679336548, "logits/rejected": -2.3234875202178955, "logps/chosen": -271.07061767578125, "logps/rejected": -242.3330535888672, "loss": 0.6587, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.012964209541678429, "rewards/margins": 0.07740563899278641, "rewards/rejected": -0.06444142013788223, "step": 260 }, { "epoch": 0.04, "grad_norm": 3.53125, "learning_rate": 1.7647058823529414e-06, "logits/chosen": -2.459193468093872, "logits/rejected": -2.414381980895996, "logps/chosen": -232.7344970703125, "logps/rejected": -245.03836059570312, "loss": 0.6609, "rewards/accuracies": 0.6875, "rewards/chosen": -0.025754287838935852, "rewards/margins": 0.07392101734876633, "rewards/rejected": -0.09967531263828278, "step": 270 }, { "epoch": 0.04, "grad_norm": 3.546875, "learning_rate": 1.8300653594771242e-06, "logits/chosen": -2.3779327869415283, "logits/rejected": -2.3616323471069336, "logps/chosen": -235.00436401367188, "logps/rejected": -238.427978515625, "loss": 0.6686, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0035261516459286213, "rewards/margins": 0.06117261201143265, "rewards/rejected": -0.05764646455645561, "step": 280 }, { "epoch": 0.04, "grad_norm": 3.84375, "learning_rate": 1.8954248366013072e-06, "logits/chosen": -2.4707655906677246, "logits/rejected": -2.3777573108673096, "logps/chosen": -292.0356140136719, "logps/rejected": -266.07550048828125, "loss": 0.6614, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.013245267793536186, "rewards/margins": 0.07698690891265869, "rewards/rejected": -0.06374163925647736, "step": 290 }, { "epoch": 0.04, "grad_norm": 4.34375, "learning_rate": 1.96078431372549e-06, "logits/chosen": -2.4952127933502197, "logits/rejected": -2.3478877544403076, "logps/chosen": -272.2964172363281, "logps/rejected": -271.04937744140625, "loss": 0.6623, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.06571820378303528, "rewards/margins": 0.07633010298013687, "rewards/rejected": -0.14204831421375275, "step": 300 }, { "epoch": 0.04, "eval_logits/chosen": -2.3202240467071533, "eval_logits/rejected": -2.2310445308685303, "eval_logps/chosen": -273.9336853027344, "eval_logps/rejected": -262.1588439941406, "eval_loss": 0.6578417420387268, "eval_rewards/accuracies": 0.6735000014305115, "eval_rewards/chosen": -0.09312662482261658, "eval_rewards/margins": 0.08267267793416977, "eval_rewards/rejected": -0.17579929530620575, "eval_runtime": 1595.4881, "eval_samples_per_second": 1.254, "eval_steps_per_second": 0.313, "step": 300 }, { "epoch": 0.04, "grad_norm": 4.8125, "learning_rate": 2.0261437908496734e-06, "logits/chosen": -2.4691600799560547, "logits/rejected": -2.4113526344299316, "logps/chosen": -294.3885803222656, "logps/rejected": -286.75103759765625, "loss": 0.6492, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.05116035416722298, "rewards/margins": 0.10192099958658218, "rewards/rejected": -0.15308134257793427, "step": 310 }, { "epoch": 0.04, "grad_norm": 4.78125, "learning_rate": 2.0915032679738565e-06, "logits/chosen": -2.4071547985076904, "logits/rejected": -2.2507896423339844, "logps/chosen": -257.3243713378906, "logps/rejected": -214.6096954345703, "loss": 0.6449, "rewards/accuracies": 0.6875, "rewards/chosen": 0.00719846785068512, "rewards/margins": 0.11425099521875381, "rewards/rejected": -0.10705254226922989, "step": 320 }, { "epoch": 0.04, "grad_norm": 4.125, "learning_rate": 2.1568627450980393e-06, "logits/chosen": -2.530496120452881, "logits/rejected": -2.3858256340026855, "logps/chosen": -315.06976318359375, "logps/rejected": -302.36773681640625, "loss": 0.639, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.06672513484954834, "rewards/margins": 0.12611213326454163, "rewards/rejected": -0.19283726811408997, "step": 330 }, { "epoch": 0.04, "grad_norm": 4.84375, "learning_rate": 2.222222222222222e-06, "logits/chosen": -2.3970444202423096, "logits/rejected": -2.2407002449035645, "logps/chosen": -263.9761657714844, "logps/rejected": -242.84762573242188, "loss": 0.6657, "rewards/accuracies": 0.625, "rewards/chosen": -0.21234241127967834, "rewards/margins": 0.06845516711473465, "rewards/rejected": -0.2807976007461548, "step": 340 }, { "epoch": 0.05, "grad_norm": 4.125, "learning_rate": 2.2875816993464053e-06, "logits/chosen": -2.485644578933716, "logits/rejected": -2.405118703842163, "logps/chosen": -264.986328125, "logps/rejected": -254.9670867919922, "loss": 0.6408, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.06968159228563309, "rewards/margins": 0.12432824075222015, "rewards/rejected": -0.19400982558727264, "step": 350 }, { "epoch": 0.05, "grad_norm": 3.828125, "learning_rate": 2.3529411764705885e-06, "logits/chosen": -2.366410732269287, "logits/rejected": -2.3023273944854736, "logps/chosen": -270.1907653808594, "logps/rejected": -266.2153015136719, "loss": 0.6524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.003988529555499554, "rewards/margins": 0.09547580033540726, "rewards/rejected": -0.09148726612329483, "step": 360 }, { "epoch": 0.05, "grad_norm": 4.15625, "learning_rate": 2.4183006535947716e-06, "logits/chosen": -2.369810104370117, "logits/rejected": -2.2653660774230957, "logps/chosen": -234.8569793701172, "logps/rejected": -219.356689453125, "loss": 0.6548, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.0072977119125425816, "rewards/margins": 0.0890003889799118, "rewards/rejected": -0.09629810601472855, "step": 370 }, { "epoch": 0.05, "grad_norm": 5.3125, "learning_rate": 2.4836601307189544e-06, "logits/chosen": -2.3556571006774902, "logits/rejected": -2.259316921234131, "logps/chosen": -281.4800109863281, "logps/rejected": -286.7106628417969, "loss": 0.6479, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0648920014500618, "rewards/margins": 0.11231891065835953, "rewards/rejected": -0.17721091210842133, "step": 380 }, { "epoch": 0.05, "grad_norm": 4.75, "learning_rate": 2.549019607843137e-06, "logits/chosen": -2.312218189239502, "logits/rejected": -2.1933465003967285, "logps/chosen": -274.30120849609375, "logps/rejected": -267.49444580078125, "loss": 0.645, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11016625165939331, "rewards/margins": 0.12351493537425995, "rewards/rejected": -0.23368120193481445, "step": 390 }, { "epoch": 0.05, "grad_norm": 9.875, "learning_rate": 2.6143790849673208e-06, "logits/chosen": -2.315176248550415, "logits/rejected": -2.2708230018615723, "logps/chosen": -243.77029418945312, "logps/rejected": -249.6044921875, "loss": 0.6619, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.26674699783325195, "rewards/margins": 0.08755677193403244, "rewards/rejected": -0.354303777217865, "step": 400 }, { "epoch": 0.05, "eval_logits/chosen": -2.14410400390625, "eval_logits/rejected": -2.030946969985962, "eval_logps/chosen": -294.5643615722656, "eval_logps/rejected": -286.9753723144531, "eval_loss": 0.6454918384552002, "eval_rewards/accuracies": 0.6610000133514404, "eval_rewards/chosen": -0.29943349957466125, "eval_rewards/margins": 0.12453118711709976, "eval_rewards/rejected": -0.4239646792411804, "eval_runtime": 1594.2847, "eval_samples_per_second": 1.254, "eval_steps_per_second": 0.314, "step": 400 }, { "epoch": 0.05, "grad_norm": 4.03125, "learning_rate": 2.6797385620915036e-06, "logits/chosen": -2.3303027153015137, "logits/rejected": -2.211501359939575, "logps/chosen": -266.9333190917969, "logps/rejected": -248.0369873046875, "loss": 0.6583, "rewards/accuracies": 0.6875, "rewards/chosen": -0.34348687529563904, "rewards/margins": 0.08375416696071625, "rewards/rejected": -0.4272410273551941, "step": 410 }, { "epoch": 0.05, "grad_norm": 4.5625, "learning_rate": 2.7450980392156867e-06, "logits/chosen": -2.3430421352386475, "logits/rejected": -2.0668082237243652, "logps/chosen": -336.4903564453125, "logps/rejected": -275.1713562011719, "loss": 0.6576, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.41029343008995056, "rewards/margins": 0.09131225943565369, "rewards/rejected": -0.501605749130249, "step": 420 }, { "epoch": 0.06, "grad_norm": 4.59375, "learning_rate": 2.8104575163398695e-06, "logits/chosen": -2.378570079803467, "logits/rejected": -2.128157138824463, "logps/chosen": -343.38433837890625, "logps/rejected": -285.4502258300781, "loss": 0.6281, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.38465428352355957, "rewards/margins": 0.15774312615394592, "rewards/rejected": -0.5423974394798279, "step": 430 }, { "epoch": 0.06, "grad_norm": 4.09375, "learning_rate": 2.8758169934640523e-06, "logits/chosen": -2.3335065841674805, "logits/rejected": -2.3891799449920654, "logps/chosen": -266.10101318359375, "logps/rejected": -275.11773681640625, "loss": 0.6645, "rewards/accuracies": 0.625, "rewards/chosen": -0.3828270733356476, "rewards/margins": 0.08971767127513885, "rewards/rejected": -0.47254472970962524, "step": 440 }, { "epoch": 0.06, "grad_norm": 4.65625, "learning_rate": 2.9411764705882355e-06, "logits/chosen": -2.360802412033081, "logits/rejected": -2.125321865081787, "logps/chosen": -305.62811279296875, "logps/rejected": -278.4323425292969, "loss": 0.6356, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3769600987434387, "rewards/margins": 0.14625975489616394, "rewards/rejected": -0.523219883441925, "step": 450 }, { "epoch": 0.06, "grad_norm": 5.28125, "learning_rate": 3.0065359477124182e-06, "logits/chosen": -2.2305850982666016, "logits/rejected": -2.1131339073181152, "logps/chosen": -325.5186462402344, "logps/rejected": -308.9561767578125, "loss": 0.6336, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.43926286697387695, "rewards/margins": 0.1619986593723297, "rewards/rejected": -0.6012614965438843, "step": 460 }, { "epoch": 0.06, "grad_norm": 5.28125, "learning_rate": 3.071895424836602e-06, "logits/chosen": -2.356358051300049, "logits/rejected": -2.2044506072998047, "logps/chosen": -292.234130859375, "logps/rejected": -312.48565673828125, "loss": 0.6471, "rewards/accuracies": 0.625, "rewards/chosen": -0.4053603708744049, "rewards/margins": 0.1308523416519165, "rewards/rejected": -0.5362127423286438, "step": 470 }, { "epoch": 0.06, "grad_norm": 7.1875, "learning_rate": 3.1372549019607846e-06, "logits/chosen": -2.2809040546417236, "logits/rejected": -2.200786590576172, "logps/chosen": -255.33657836914062, "logps/rejected": -276.2983093261719, "loss": 0.6495, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2522006928920746, "rewards/margins": 0.13950569927692413, "rewards/rejected": -0.3917064070701599, "step": 480 }, { "epoch": 0.06, "grad_norm": 5.28125, "learning_rate": 3.2026143790849674e-06, "logits/chosen": -2.4270236492156982, "logits/rejected": -2.225928783416748, "logps/chosen": -251.0017547607422, "logps/rejected": -223.0763397216797, "loss": 0.6329, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2193588763475418, "rewards/margins": 0.17358329892158508, "rewards/rejected": -0.3929421603679657, "step": 490 }, { "epoch": 0.07, "grad_norm": 5.1875, "learning_rate": 3.2679738562091506e-06, "logits/chosen": -2.4462802410125732, "logits/rejected": -2.2391388416290283, "logps/chosen": -285.2283935546875, "logps/rejected": -278.61834716796875, "loss": 0.6257, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.30025702714920044, "rewards/margins": 0.18700367212295532, "rewards/rejected": -0.4872606694698334, "step": 500 }, { "epoch": 0.07, "eval_logits/chosen": -2.1485366821289062, "eval_logits/rejected": -2.0400145053863525, "eval_logps/chosen": -299.84417724609375, "eval_logps/rejected": -300.69671630859375, "eval_loss": 0.6193849444389343, "eval_rewards/accuracies": 0.6850000023841858, "eval_rewards/chosen": -0.35223132371902466, "eval_rewards/margins": 0.20894668996334076, "eval_rewards/rejected": -0.5611779689788818, "eval_runtime": 1595.3344, "eval_samples_per_second": 1.254, "eval_steps_per_second": 0.313, "step": 500 }, { "epoch": 0.07, "grad_norm": 6.3125, "learning_rate": 3.3333333333333333e-06, "logits/chosen": -2.2720694541931152, "logits/rejected": -2.0909745693206787, "logps/chosen": -319.68511962890625, "logps/rejected": -312.24517822265625, "loss": 0.6034, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3121896982192993, "rewards/margins": 0.25317278504371643, "rewards/rejected": -0.5653624534606934, "step": 510 }, { "epoch": 0.07, "grad_norm": 5.65625, "learning_rate": 3.398692810457517e-06, "logits/chosen": -2.4018216133117676, "logits/rejected": -2.334257125854492, "logps/chosen": -304.389404296875, "logps/rejected": -286.22607421875, "loss": 0.6273, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.311653196811676, "rewards/margins": 0.19162270426750183, "rewards/rejected": -0.503275990486145, "step": 520 }, { "epoch": 0.07, "grad_norm": 5.875, "learning_rate": 3.4640522875816997e-06, "logits/chosen": -2.320746421813965, "logits/rejected": -2.172206401824951, "logps/chosen": -301.77679443359375, "logps/rejected": -298.7806701660156, "loss": 0.5768, "rewards/accuracies": 0.75, "rewards/chosen": -0.4527795910835266, "rewards/margins": 0.2991865277290344, "rewards/rejected": -0.751966118812561, "step": 530 }, { "epoch": 0.07, "grad_norm": 5.6875, "learning_rate": 3.529411764705883e-06, "logits/chosen": -2.257561445236206, "logits/rejected": -2.160403251647949, "logps/chosen": -344.9192199707031, "logps/rejected": -380.3932189941406, "loss": 0.5383, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.6207869648933411, "rewards/margins": 0.4684711992740631, "rewards/rejected": -1.0892581939697266, "step": 540 }, { "epoch": 0.07, "grad_norm": 6.0, "learning_rate": 3.5947712418300657e-06, "logits/chosen": -2.241232395172119, "logits/rejected": -1.9481990337371826, "logps/chosen": -365.34429931640625, "logps/rejected": -332.47369384765625, "loss": 0.6163, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5884149670600891, "rewards/margins": 0.3151562809944153, "rewards/rejected": -0.9035712480545044, "step": 550 }, { "epoch": 0.07, "grad_norm": 5.3125, "learning_rate": 3.6601307189542484e-06, "logits/chosen": -2.107689142227173, "logits/rejected": -2.058168888092041, "logps/chosen": -286.3569641113281, "logps/rejected": -329.6611633300781, "loss": 0.5914, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5777391791343689, "rewards/margins": 0.3210974335670471, "rewards/rejected": -0.8988364934921265, "step": 560 }, { "epoch": 0.07, "grad_norm": 8.125, "learning_rate": 3.7254901960784316e-06, "logits/chosen": -2.1557562351226807, "logits/rejected": -2.2561655044555664, "logps/chosen": -304.96063232421875, "logps/rejected": -351.2149353027344, "loss": 0.5715, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4714527726173401, "rewards/margins": 0.3753376305103302, "rewards/rejected": -0.8467904329299927, "step": 570 }, { "epoch": 0.08, "grad_norm": 12.125, "learning_rate": 3.7908496732026144e-06, "logits/chosen": -1.9725488424301147, "logits/rejected": -1.8165016174316406, "logps/chosen": -353.0700378417969, "logps/rejected": -351.30902099609375, "loss": 0.5807, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5593926310539246, "rewards/margins": 0.3593262732028961, "rewards/rejected": -0.9187189340591431, "step": 580 }, { "epoch": 0.08, "grad_norm": 19.75, "learning_rate": 3.856209150326798e-06, "logits/chosen": -1.9927419424057007, "logits/rejected": -1.7365013360977173, "logps/chosen": -356.13531494140625, "logps/rejected": -363.0181579589844, "loss": 0.6351, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7341691851615906, "rewards/margins": 0.30337682366371155, "rewards/rejected": -1.037545919418335, "step": 590 }, { "epoch": 0.08, "grad_norm": 5.15625, "learning_rate": 3.92156862745098e-06, "logits/chosen": -1.9878699779510498, "logits/rejected": -1.9630234241485596, "logps/chosen": -298.11883544921875, "logps/rejected": -349.0447692871094, "loss": 0.6114, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5508363842964172, "rewards/margins": 0.29506638646125793, "rewards/rejected": -0.8459027409553528, "step": 600 }, { "epoch": 0.08, "eval_logits/chosen": -1.719992756843567, "eval_logits/rejected": -1.550325632095337, "eval_logps/chosen": -327.6963806152344, "eval_logps/rejected": -340.6011962890625, "eval_loss": 0.6003695130348206, "eval_rewards/accuracies": 0.6754999756813049, "eval_rewards/chosen": -0.6307531595230103, "eval_rewards/margins": 0.3294694721698761, "eval_rewards/rejected": -0.960222601890564, "eval_runtime": 1594.9578, "eval_samples_per_second": 1.254, "eval_steps_per_second": 0.313, "step": 600 }, { "epoch": 0.08, "grad_norm": 9.4375, "learning_rate": 3.986928104575164e-06, "logits/chosen": -2.0066254138946533, "logits/rejected": -1.6175451278686523, "logps/chosen": -334.5193786621094, "logps/rejected": -315.98345947265625, "loss": 0.594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7500374913215637, "rewards/margins": 0.318537175655365, "rewards/rejected": -1.0685746669769287, "step": 610 }, { "epoch": 0.08, "grad_norm": 7.46875, "learning_rate": 4.052287581699347e-06, "logits/chosen": -1.4493825435638428, "logits/rejected": -1.4420878887176514, "logps/chosen": -362.75543212890625, "logps/rejected": -412.0641174316406, "loss": 0.5098, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9031947255134583, "rewards/margins": 0.5772475600242615, "rewards/rejected": -1.4804422855377197, "step": 620 }, { "epoch": 0.08, "grad_norm": 13.75, "learning_rate": 4.11764705882353e-06, "logits/chosen": -1.7892284393310547, "logits/rejected": -1.3057540655136108, "logps/chosen": -373.5083923339844, "logps/rejected": -370.4551086425781, "loss": 0.6099, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9319362640380859, "rewards/margins": 0.3402162194252014, "rewards/rejected": -1.2721525430679321, "step": 630 }, { "epoch": 0.08, "grad_norm": 6.375, "learning_rate": 4.183006535947713e-06, "logits/chosen": -1.5417096614837646, "logits/rejected": -1.0605213642120361, "logps/chosen": -311.5467529296875, "logps/rejected": -309.4730224609375, "loss": 0.6052, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7302576899528503, "rewards/margins": 0.36248451471328735, "rewards/rejected": -1.0927422046661377, "step": 640 }, { "epoch": 0.09, "grad_norm": 6.75, "learning_rate": 4.2483660130718954e-06, "logits/chosen": -1.3266146183013916, "logits/rejected": -0.8559074401855469, "logps/chosen": -351.74542236328125, "logps/rejected": -339.4545593261719, "loss": 0.5848, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7287541627883911, "rewards/margins": 0.41469335556030273, "rewards/rejected": -1.1434475183486938, "step": 650 }, { "epoch": 0.09, "grad_norm": 8.5625, "learning_rate": 4.313725490196079e-06, "logits/chosen": -1.0535982847213745, "logits/rejected": -0.5663085579872131, "logps/chosen": -340.8390197753906, "logps/rejected": -378.7320556640625, "loss": 0.5298, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7466461062431335, "rewards/margins": 0.5064037442207336, "rewards/rejected": -1.2530498504638672, "step": 660 }, { "epoch": 0.09, "grad_norm": 10.375, "learning_rate": 4.379084967320262e-06, "logits/chosen": -0.47771701216697693, "logits/rejected": -0.5111185312271118, "logps/chosen": -349.4082946777344, "logps/rejected": -397.6182556152344, "loss": 0.5864, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.225978136062622, "rewards/margins": 0.3780650496482849, "rewards/rejected": -1.6040430068969727, "step": 670 }, { "epoch": 0.09, "grad_norm": 6.84375, "learning_rate": 4.444444444444444e-06, "logits/chosen": -0.7972174286842346, "logits/rejected": -0.40817341208457947, "logps/chosen": -455.7498474121094, "logps/rejected": -472.62994384765625, "loss": 0.5718, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6846425533294678, "rewards/margins": 0.46953901648521423, "rewards/rejected": -2.154181480407715, "step": 680 }, { "epoch": 0.09, "grad_norm": 9.5625, "learning_rate": 4.509803921568628e-06, "logits/chosen": -0.33812543749809265, "logits/rejected": -0.39494821429252625, "logps/chosen": -400.83795166015625, "logps/rejected": -469.98486328125, "loss": 0.5862, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6113307476043701, "rewards/margins": 0.44311681389808655, "rewards/rejected": -2.054447650909424, "step": 690 }, { "epoch": 0.09, "grad_norm": 11.125, "learning_rate": 4.5751633986928105e-06, "logits/chosen": -1.0038961172103882, "logits/rejected": -0.4275578558444977, "logps/chosen": -435.5682678222656, "logps/rejected": -460.2854919433594, "loss": 0.5394, "rewards/accuracies": 0.75, "rewards/chosen": -1.5751301050186157, "rewards/margins": 0.5817869901657104, "rewards/rejected": -2.156917095184326, "step": 700 }, { "epoch": 0.09, "eval_logits/chosen": -0.9309160709381104, "eval_logits/rejected": -0.6532166600227356, "eval_logps/chosen": -421.52081298828125, "eval_logps/rejected": -443.0096130371094, "eval_loss": 0.6103234887123108, "eval_rewards/accuracies": 0.6635000109672546, "eval_rewards/chosen": -1.5689976215362549, "eval_rewards/margins": 0.4153095483779907, "eval_rewards/rejected": -1.9843071699142456, "eval_runtime": 1595.2265, "eval_samples_per_second": 1.254, "eval_steps_per_second": 0.313, "step": 700 }, { "epoch": 0.09, "grad_norm": 10.5625, "learning_rate": 4.640522875816994e-06, "logits/chosen": -1.3852497339248657, "logits/rejected": -1.0913816690444946, "logps/chosen": -403.0024108886719, "logps/rejected": -451.954833984375, "loss": 0.4959, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.197796106338501, "rewards/margins": 0.7116587162017822, "rewards/rejected": -1.9094547033309937, "step": 710 }, { "epoch": 0.09, "grad_norm": 12.125, "learning_rate": 4.705882352941177e-06, "logits/chosen": -1.3930566310882568, "logits/rejected": -0.8486838340759277, "logps/chosen": -390.4463195800781, "logps/rejected": -396.0668029785156, "loss": 0.6395, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0593743324279785, "rewards/margins": 0.36317670345306396, "rewards/rejected": -1.4225510358810425, "step": 720 }, { "epoch": 0.1, "grad_norm": 10.8125, "learning_rate": 4.77124183006536e-06, "logits/chosen": -1.3652091026306152, "logits/rejected": -0.6764923334121704, "logps/chosen": -348.77593994140625, "logps/rejected": -372.30548095703125, "loss": 0.5272, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5752251744270325, "rewards/margins": 0.49377313256263733, "rewards/rejected": -1.0689985752105713, "step": 730 }, { "epoch": 0.1, "grad_norm": 10.75, "learning_rate": 4.836601307189543e-06, "logits/chosen": -0.3494882583618164, "logits/rejected": -0.1411532610654831, "logps/chosen": -328.62261962890625, "logps/rejected": -353.97772216796875, "loss": 0.5619, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8632782697677612, "rewards/margins": 0.5012267827987671, "rewards/rejected": -1.3645050525665283, "step": 740 }, { "epoch": 0.1, "grad_norm": 9.0, "learning_rate": 4.901960784313726e-06, "logits/chosen": 0.3033140301704407, "logits/rejected": 0.7218297719955444, "logps/chosen": -423.762451171875, "logps/rejected": -459.82977294921875, "loss": 0.5727, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9222949743270874, "rewards/margins": 0.4701816141605377, "rewards/rejected": -2.3924765586853027, "step": 750 }, { "epoch": 0.1, "grad_norm": 9.25, "learning_rate": 4.967320261437909e-06, "logits/chosen": -0.20161783695220947, "logits/rejected": 0.31453028321266174, "logps/chosen": -453.60162353515625, "logps/rejected": -451.3170471191406, "loss": 0.6744, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9592533111572266, "rewards/margins": 0.2076384574174881, "rewards/rejected": -2.1668918132781982, "step": 760 }, { "epoch": 0.1, "grad_norm": 12.875, "learning_rate": 4.99999347843947e-06, "logits/chosen": -1.8158515691757202, "logits/rejected": -1.601609230041504, "logps/chosen": -350.30145263671875, "logps/rejected": -348.83441162109375, "loss": 0.5921, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.057814359664917, "rewards/margins": 0.3119596242904663, "rewards/rejected": -1.3697741031646729, "step": 770 }, { "epoch": 0.1, "grad_norm": 7.625, "learning_rate": 4.999941306159375e-06, "logits/chosen": -2.322629451751709, "logits/rejected": -2.3233141899108887, "logps/chosen": -302.6885986328125, "logps/rejected": -347.59197998046875, "loss": 0.5802, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5237258672714233, "rewards/margins": 0.384928822517395, "rewards/rejected": -0.9086545705795288, "step": 780 }, { "epoch": 0.1, "grad_norm": 13.0625, "learning_rate": 4.999836962687967e-06, "logits/chosen": -1.4795883893966675, "logits/rejected": -1.3030979633331299, "logps/chosen": -359.45623779296875, "logps/rejected": -399.82476806640625, "loss": 0.6045, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2977018356323242, "rewards/margins": 0.38991469144821167, "rewards/rejected": -1.6876163482666016, "step": 790 }, { "epoch": 0.1, "grad_norm": 6.9375, "learning_rate": 4.999680450202786e-06, "logits/chosen": -0.6206024289131165, "logits/rejected": -0.4429781436920166, "logps/chosen": -472.45892333984375, "logps/rejected": -505.4447326660156, "loss": 0.6171, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7535765171051025, "rewards/margins": 0.31215736269950867, "rewards/rejected": -2.0657336711883545, "step": 800 }, { "epoch": 0.1, "eval_logits/chosen": -0.3348754942417145, "eval_logits/rejected": 0.023464158177375793, "eval_logps/chosen": -440.0762023925781, "eval_logps/rejected": -450.9858093261719, "eval_loss": 0.6371665000915527, "eval_rewards/accuracies": 0.640500009059906, "eval_rewards/chosen": -1.7545514106750488, "eval_rewards/margins": 0.3095169961452484, "eval_rewards/rejected": -2.06406831741333, "eval_runtime": 1593.2658, "eval_samples_per_second": 1.255, "eval_steps_per_second": 0.314, "step": 800 }, { "epoch": 0.11, "grad_norm": 14.9375, "learning_rate": 4.999471771970087e-06, "logits/chosen": -0.6086374521255493, "logits/rejected": -0.3623660206794739, "logps/chosen": -411.4615783691406, "logps/rejected": -428.5804748535156, "loss": 0.6063, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6526752710342407, "rewards/margins": 0.3578190803527832, "rewards/rejected": -2.0104942321777344, "step": 810 }, { "epoch": 0.11, "grad_norm": 12.8125, "learning_rate": 4.999210932344767e-06, "logits/chosen": -1.3322651386260986, "logits/rejected": -1.328564167022705, "logps/chosen": -397.54046630859375, "logps/rejected": -491.22869873046875, "loss": 0.6269, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5655710697174072, "rewards/margins": 0.4129908084869385, "rewards/rejected": -1.9785619974136353, "step": 820 }, { "epoch": 0.11, "grad_norm": 9.5, "learning_rate": 4.998897936770281e-06, "logits/chosen": -1.9537856578826904, "logits/rejected": -1.8272556066513062, "logps/chosen": -436.32110595703125, "logps/rejected": -406.75750732421875, "loss": 0.5976, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.332761526107788, "rewards/margins": 0.4022786021232605, "rewards/rejected": -1.735040307044983, "step": 830 }, { "epoch": 0.11, "grad_norm": 9.3125, "learning_rate": 4.998532791778521e-06, "logits/chosen": -2.0961689949035645, "logits/rejected": -2.0401535034179688, "logps/chosen": -322.5164489746094, "logps/rejected": -366.49139404296875, "loss": 0.6556, "rewards/accuracies": 0.625, "rewards/chosen": -0.9384282827377319, "rewards/margins": 0.1931869089603424, "rewards/rejected": -1.1316152811050415, "step": 840 }, { "epoch": 0.11, "grad_norm": 6.40625, "learning_rate": 4.9981155049896885e-06, "logits/chosen": -1.9329078197479248, "logits/rejected": -1.8825485706329346, "logps/chosen": -370.23004150390625, "logps/rejected": -405.8755798339844, "loss": 0.5015, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8216459155082703, "rewards/margins": 0.5655540823936462, "rewards/rejected": -1.387199878692627, "step": 850 }, { "epoch": 0.11, "grad_norm": 10.3125, "learning_rate": 4.997646085112126e-06, "logits/chosen": -1.2966748476028442, "logits/rejected": -1.0200374126434326, "logps/chosen": -397.69146728515625, "logps/rejected": -391.2544860839844, "loss": 0.6826, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3053414821624756, "rewards/margins": 0.20140385627746582, "rewards/rejected": -1.506745457649231, "step": 860 }, { "epoch": 0.11, "grad_norm": 12.9375, "learning_rate": 4.997124541942141e-06, "logits/chosen": -0.5647540092468262, "logits/rejected": -0.14192932844161987, "logps/chosen": -369.59539794921875, "logps/rejected": -405.70843505859375, "loss": 0.6578, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4736089706420898, "rewards/margins": 0.299024760723114, "rewards/rejected": -1.7726337909698486, "step": 870 }, { "epoch": 0.12, "grad_norm": 6.34375, "learning_rate": 4.996550886363801e-06, "logits/chosen": -0.5589116811752319, "logits/rejected": -0.307116836309433, "logps/chosen": -302.707275390625, "logps/rejected": -328.901123046875, "loss": 0.5766, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9649427533149719, "rewards/margins": 0.3963368535041809, "rewards/rejected": -1.3612796068191528, "step": 880 }, { "epoch": 0.12, "grad_norm": 9.625, "learning_rate": 4.995925130348706e-06, "logits/chosen": 0.031105151399970055, "logits/rejected": 0.044142670929431915, "logps/chosen": -287.320068359375, "logps/rejected": -327.9627990722656, "loss": 0.6079, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.937950611114502, "rewards/margins": 0.3377194404602051, "rewards/rejected": -1.2756701707839966, "step": 890 }, { "epoch": 0.12, "grad_norm": 9.4375, "learning_rate": 4.995247286955734e-06, "logits/chosen": 1.0088450908660889, "logits/rejected": 1.7407630681991577, "logps/chosen": -457.3868103027344, "logps/rejected": -457.37579345703125, "loss": 0.5553, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2731468677520752, "rewards/margins": 0.5888018012046814, "rewards/rejected": -1.8619487285614014, "step": 900 }, { "epoch": 0.12, "eval_logits/chosen": 1.9978185892105103, "eval_logits/rejected": 2.6186578273773193, "eval_logps/chosen": -399.6167907714844, "eval_logps/rejected": -429.9809265136719, "eval_loss": 0.5687113404273987, "eval_rewards/accuracies": 0.6930000185966492, "eval_rewards/chosen": -1.3499573469161987, "eval_rewards/margins": 0.5040626525878906, "eval_rewards/rejected": -1.8540199995040894, "eval_runtime": 1593.662, "eval_samples_per_second": 1.255, "eval_steps_per_second": 0.314, "step": 900 }, { "epoch": 0.12, "grad_norm": 10.3125, "learning_rate": 4.994517370330779e-06, "logits/chosen": 0.6192656755447388, "logits/rejected": 1.669968605041504, "logps/chosen": -435.8106994628906, "logps/rejected": -448.48309326171875, "loss": 0.5258, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4264830350875854, "rewards/margins": 0.6171587705612183, "rewards/rejected": -2.043642044067383, "step": 910 }, { "epoch": 0.12, "grad_norm": 15.6875, "learning_rate": 4.993735395706446e-06, "logits/chosen": 1.1262381076812744, "logits/rejected": 1.8840898275375366, "logps/chosen": -394.78826904296875, "logps/rejected": -433.81988525390625, "loss": 0.5314, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5628669261932373, "rewards/margins": 0.6049157977104187, "rewards/rejected": -2.167782783508301, "step": 920 }, { "epoch": 0.12, "grad_norm": 11.25, "learning_rate": 4.992901379401737e-06, "logits/chosen": 0.14038251340389252, "logits/rejected": 0.24134019017219543, "logps/chosen": -398.86578369140625, "logps/rejected": -472.0187072753906, "loss": 0.5735, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3214950561523438, "rewards/margins": 0.5916348695755005, "rewards/rejected": -1.9131300449371338, "step": 930 }, { "epoch": 0.12, "grad_norm": 11.75, "learning_rate": 4.992015338821711e-06, "logits/chosen": -0.04747029393911362, "logits/rejected": 1.1106561422348022, "logps/chosen": -391.7012023925781, "logps/rejected": -389.11602783203125, "loss": 0.6282, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.212070345878601, "rewards/margins": 0.5102335214614868, "rewards/rejected": -1.7223039865493774, "step": 940 }, { "epoch": 0.12, "grad_norm": 13.375, "learning_rate": 4.991077292457117e-06, "logits/chosen": 1.087203860282898, "logits/rejected": 1.1788231134414673, "logps/chosen": -340.7041931152344, "logps/rejected": -388.44293212890625, "loss": 0.6201, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0653443336486816, "rewards/margins": 0.430236279964447, "rewards/rejected": -1.4955805540084839, "step": 950 }, { "epoch": 0.13, "grad_norm": 7.65625, "learning_rate": 4.990087259884016e-06, "logits/chosen": 0.7325303554534912, "logits/rejected": 1.3778412342071533, "logps/chosen": -308.00799560546875, "logps/rejected": -357.46337890625, "loss": 0.5209, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8226054310798645, "rewards/margins": 0.6232573390007019, "rewards/rejected": -1.445862889289856, "step": 960 }, { "epoch": 0.13, "grad_norm": 17.875, "learning_rate": 4.989045261763362e-06, "logits/chosen": 1.262274980545044, "logits/rejected": 1.1897504329681396, "logps/chosen": -346.5185546875, "logps/rejected": -419.4173278808594, "loss": 0.5454, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0726191997528076, "rewards/margins": 0.6499998569488525, "rewards/rejected": -1.7226190567016602, "step": 970 }, { "epoch": 0.13, "grad_norm": 10.4375, "learning_rate": 4.98795131984058e-06, "logits/chosen": 0.36344969272613525, "logits/rejected": 1.0654281377792358, "logps/chosen": -407.2535705566406, "logps/rejected": -422.3675842285156, "loss": 0.631, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3320995569229126, "rewards/margins": 0.524221658706665, "rewards/rejected": -1.856321096420288, "step": 980 }, { "epoch": 0.13, "grad_norm": 5.875, "learning_rate": 4.986805456945107e-06, "logits/chosen": 1.200073480606079, "logits/rejected": 2.0859665870666504, "logps/chosen": -359.2008361816406, "logps/rejected": -392.6002502441406, "loss": 0.5752, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1315548419952393, "rewards/margins": 0.49356216192245483, "rewards/rejected": -1.6251170635223389, "step": 990 }, { "epoch": 0.13, "grad_norm": 16.5, "learning_rate": 4.985607696989919e-06, "logits/chosen": 1.4213409423828125, "logits/rejected": 2.354599714279175, "logps/chosen": -426.97015380859375, "logps/rejected": -458.9466857910156, "loss": 0.6299, "rewards/accuracies": 0.625, "rewards/chosen": -1.4458622932434082, "rewards/margins": 0.4511790871620178, "rewards/rejected": -1.8970413208007812, "step": 1000 }, { "epoch": 0.13, "eval_logits/chosen": 2.715547561645508, "eval_logits/rejected": 3.4191699028015137, "eval_logps/chosen": -380.9113464355469, "eval_logps/rejected": -419.2182312011719, "eval_loss": 0.5619704723358154, "eval_rewards/accuracies": 0.6974999904632568, "eval_rewards/chosen": -1.1629031896591187, "eval_rewards/margins": 0.5834897756576538, "eval_rewards/rejected": -1.746392846107483, "eval_runtime": 1593.8153, "eval_samples_per_second": 1.255, "eval_steps_per_second": 0.314, "step": 1000 }, { "epoch": 0.13, "grad_norm": 9.5, "learning_rate": 4.984358064971026e-06, "logits/chosen": 1.5299071073532104, "logits/rejected": 1.7706670761108398, "logps/chosen": -340.0618591308594, "logps/rejected": -408.04302978515625, "loss": 0.5532, "rewards/accuracies": 0.75, "rewards/chosen": -0.9399666786193848, "rewards/margins": 0.690420925617218, "rewards/rejected": -1.6303876638412476, "step": 1010 }, { "epoch": 0.13, "grad_norm": 9.9375, "learning_rate": 4.983056586966958e-06, "logits/chosen": 0.010038676671683788, "logits/rejected": 0.8103101849555969, "logps/chosen": -340.81524658203125, "logps/rejected": -335.1512145996094, "loss": 0.563, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7835585474967957, "rewards/margins": 0.46336811780929565, "rewards/rejected": -1.2469266653060913, "step": 1020 }, { "epoch": 0.13, "grad_norm": 16.5, "learning_rate": 4.981703290138215e-06, "logits/chosen": 1.5737102031707764, "logits/rejected": 1.8098487854003906, "logps/chosen": -342.5895690917969, "logps/rejected": -421.34698486328125, "loss": 0.5058, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0919854640960693, "rewards/margins": 0.6574614644050598, "rewards/rejected": -1.7494471073150635, "step": 1030 }, { "epoch": 0.14, "grad_norm": 18.0, "learning_rate": 4.980298202726706e-06, "logits/chosen": 2.3896193504333496, "logits/rejected": 2.306922435760498, "logps/chosen": -377.5128479003906, "logps/rejected": -464.7425842285156, "loss": 0.566, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3240822553634644, "rewards/margins": 0.7185450792312622, "rewards/rejected": -2.0426273345947266, "step": 1040 }, { "epoch": 0.14, "grad_norm": 9.5625, "learning_rate": 4.978841354055148e-06, "logits/chosen": 1.665701150894165, "logits/rejected": 2.199453592300415, "logps/chosen": -363.497314453125, "logps/rejected": -445.28619384765625, "loss": 0.5545, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3350220918655396, "rewards/margins": 0.6997233629226685, "rewards/rejected": -2.034745216369629, "step": 1050 }, { "epoch": 0.14, "grad_norm": 8.5, "learning_rate": 4.977332774526471e-06, "logits/chosen": 1.5860077142715454, "logits/rejected": 2.17120099067688, "logps/chosen": -387.22027587890625, "logps/rejected": -447.6641540527344, "loss": 0.5731, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2284187078475952, "rewards/margins": 0.6316946148872375, "rewards/rejected": -1.860113501548767, "step": 1060 }, { "epoch": 0.14, "grad_norm": 9.3125, "learning_rate": 4.97577249562317e-06, "logits/chosen": 1.3059680461883545, "logits/rejected": 2.729459047317505, "logps/chosen": -400.4163513183594, "logps/rejected": -435.00238037109375, "loss": 0.5853, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3927881717681885, "rewards/margins": 0.6526451110839844, "rewards/rejected": -2.045433282852173, "step": 1070 }, { "epoch": 0.14, "grad_norm": 13.6875, "learning_rate": 4.974160549906652e-06, "logits/chosen": 1.9924103021621704, "logits/rejected": 2.650087356567383, "logps/chosen": -385.2442932128906, "logps/rejected": -420.45648193359375, "loss": 0.584, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.272428035736084, "rewards/margins": 0.5629833340644836, "rewards/rejected": -1.8354114294052124, "step": 1080 }, { "epoch": 0.14, "grad_norm": 8.125, "learning_rate": 4.972496971016559e-06, "logits/chosen": 3.1397948265075684, "logits/rejected": 3.477116107940674, "logps/chosen": -447.5340881347656, "logps/rejected": -512.5313110351562, "loss": 0.4966, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.014864206314087, "rewards/margins": 0.7934069037437439, "rewards/rejected": -2.8082709312438965, "step": 1090 }, { "epoch": 0.14, "grad_norm": 10.5, "learning_rate": 4.9707817936700635e-06, "logits/chosen": 2.5927772521972656, "logits/rejected": 4.051385879516602, "logps/chosen": -492.2637634277344, "logps/rejected": -532.87060546875, "loss": 0.5898, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.3247809410095215, "rewards/margins": 0.7253618836402893, "rewards/rejected": -3.050142765045166, "step": 1100 }, { "epoch": 0.14, "eval_logits/chosen": 4.413373947143555, "eval_logits/rejected": 5.307820796966553, "eval_logps/chosen": -508.3033447265625, "eval_logps/rejected": -554.2042236328125, "eval_loss": 0.561853289604187, "eval_rewards/accuracies": 0.7089999914169312, "eval_rewards/chosen": -2.4368231296539307, "eval_rewards/margins": 0.6594300270080566, "eval_rewards/rejected": -3.0962531566619873, "eval_runtime": 1593.906, "eval_samples_per_second": 1.255, "eval_steps_per_second": 0.314, "step": 1100 }, { "epoch": 0.15, "grad_norm": 8.1875, "learning_rate": 4.969015053661142e-06, "logits/chosen": 2.426276683807373, "logits/rejected": 4.329206943511963, "logps/chosen": -526.8760986328125, "logps/rejected": -530.1583251953125, "loss": 0.5581, "rewards/accuracies": 0.75, "rewards/chosen": -2.395789384841919, "rewards/margins": 0.6795339584350586, "rewards/rejected": -3.0753233432769775, "step": 1110 }, { "epoch": 0.15, "grad_norm": 5.4375, "learning_rate": 4.967196787859835e-06, "logits/chosen": 2.7714107036590576, "logits/rejected": 3.8114936351776123, "logps/chosen": -434.1639099121094, "logps/rejected": -502.12945556640625, "loss": 0.5417, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9444061517715454, "rewards/margins": 0.7909334301948547, "rewards/rejected": -2.735339403152466, "step": 1120 }, { "epoch": 0.15, "grad_norm": 14.0625, "learning_rate": 4.965327034211469e-06, "logits/chosen": 1.6808677911758423, "logits/rejected": 2.370750904083252, "logps/chosen": -446.2379455566406, "logps/rejected": -482.42315673828125, "loss": 0.6304, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5557124614715576, "rewards/margins": 0.4479561448097229, "rewards/rejected": -2.003668785095215, "step": 1130 }, { "epoch": 0.15, "grad_norm": 8.25, "learning_rate": 4.96340583173587e-06, "logits/chosen": 1.121919870376587, "logits/rejected": 2.6305222511291504, "logps/chosen": -355.7509460449219, "logps/rejected": -417.93560791015625, "loss": 0.4562, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1430405378341675, "rewards/margins": 0.7580685615539551, "rewards/rejected": -1.901108980178833, "step": 1140 }, { "epoch": 0.15, "grad_norm": 9.9375, "learning_rate": 4.96143322052655e-06, "logits/chosen": 2.945784330368042, "logits/rejected": 3.851064682006836, "logps/chosen": -396.95355224609375, "logps/rejected": -517.4216918945312, "loss": 0.5018, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6020971536636353, "rewards/margins": 0.9275411367416382, "rewards/rejected": -2.5296382904052734, "step": 1150 }, { "epoch": 0.15, "grad_norm": 9.625, "learning_rate": 4.959409241749864e-06, "logits/chosen": 2.93442702293396, "logits/rejected": 3.921786069869995, "logps/chosen": -467.01605224609375, "logps/rejected": -536.6296997070312, "loss": 0.6158, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.274714708328247, "rewards/margins": 0.7702791690826416, "rewards/rejected": -3.0449936389923096, "step": 1160 }, { "epoch": 0.15, "grad_norm": 13.4375, "learning_rate": 4.957333937644159e-06, "logits/chosen": 2.806804656982422, "logits/rejected": 3.4476191997528076, "logps/chosen": -494.03155517578125, "logps/rejected": -536.6123657226562, "loss": 0.6034, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.2405643463134766, "rewards/margins": 0.6038447022438049, "rewards/rejected": -2.844409227371216, "step": 1170 }, { "epoch": 0.15, "grad_norm": 7.75, "learning_rate": 4.955207351518885e-06, "logits/chosen": 2.9292523860931396, "logits/rejected": 3.8048958778381348, "logps/chosen": -519.1878662109375, "logps/rejected": -557.2689208984375, "loss": 0.5872, "rewards/accuracies": 0.75, "rewards/chosen": -2.184176206588745, "rewards/margins": 0.6229387521743774, "rewards/rejected": -2.807114839553833, "step": 1180 }, { "epoch": 0.16, "grad_norm": 6.125, "learning_rate": 4.953029527753699e-06, "logits/chosen": 2.54266357421875, "logits/rejected": 3.0706546306610107, "logps/chosen": -409.0013122558594, "logps/rejected": -532.3023681640625, "loss": 0.5531, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8078029155731201, "rewards/margins": 0.9258928298950195, "rewards/rejected": -2.7336957454681396, "step": 1190 }, { "epoch": 0.16, "grad_norm": 18.25, "learning_rate": 4.95080051179753e-06, "logits/chosen": 2.138737678527832, "logits/rejected": 2.217345952987671, "logps/chosen": -414.7706604003906, "logps/rejected": -497.00506591796875, "loss": 0.4782, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3488829135894775, "rewards/margins": 0.9236246943473816, "rewards/rejected": -2.272507429122925, "step": 1200 }, { "epoch": 0.16, "eval_logits/chosen": 3.148491859436035, "eval_logits/rejected": 4.01865291595459, "eval_logps/chosen": -415.2228698730469, "eval_logps/rejected": -468.4132385253906, "eval_loss": 0.5594326257705688, "eval_rewards/accuracies": 0.7089999914169312, "eval_rewards/chosen": -1.5060184001922607, "eval_rewards/margins": 0.7323250770568848, "eval_rewards/rejected": -2.2383434772491455, "eval_runtime": 1593.535, "eval_samples_per_second": 1.255, "eval_steps_per_second": 0.314, "step": 1200 }, { "epoch": 0.16, "grad_norm": 13.6875, "learning_rate": 4.948520350167637e-06, "logits/chosen": 1.060068130493164, "logits/rejected": 2.634003162384033, "logps/chosen": -432.3033142089844, "logps/rejected": -470.77783203125, "loss": 0.5074, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6320292949676514, "rewards/margins": 0.7347411513328552, "rewards/rejected": -2.3667702674865723, "step": 1210 }, { "epoch": 0.16, "grad_norm": 8.5625, "learning_rate": 4.946189090448639e-06, "logits/chosen": 1.204738974571228, "logits/rejected": 2.704601287841797, "logps/chosen": -428.56695556640625, "logps/rejected": -472.8544006347656, "loss": 0.5472, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6184256076812744, "rewards/margins": 0.8675263524055481, "rewards/rejected": -2.4859519004821777, "step": 1220 }, { "epoch": 0.16, "grad_norm": 7.78125, "learning_rate": 4.943806781291515e-06, "logits/chosen": 2.1226108074188232, "logits/rejected": 3.0666372776031494, "logps/chosen": -409.69342041015625, "logps/rejected": -433.8475646972656, "loss": 0.6601, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4817649126052856, "rewards/margins": 0.5832273960113525, "rewards/rejected": -2.0649921894073486, "step": 1230 }, { "epoch": 0.16, "grad_norm": 22.0, "learning_rate": 4.941373472412595e-06, "logits/chosen": 2.166796922683716, "logits/rejected": 3.9252426624298096, "logps/chosen": -435.42559814453125, "logps/rejected": -491.82464599609375, "loss": 0.5594, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7724716663360596, "rewards/margins": 0.9388161897659302, "rewards/rejected": -2.7112877368927, "step": 1240 }, { "epoch": 0.16, "grad_norm": 7.5625, "learning_rate": 4.938889214592521e-06, "logits/chosen": 2.505998373031616, "logits/rejected": 3.1269757747650146, "logps/chosen": -388.64581298828125, "logps/rejected": -475.17303466796875, "loss": 0.4631, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7814483642578125, "rewards/margins": 1.0449895858764648, "rewards/rejected": -2.8264379501342773, "step": 1250 }, { "epoch": 0.16, "grad_norm": 13.5625, "learning_rate": 4.936354059675186e-06, "logits/chosen": 0.6287264823913574, "logits/rejected": 1.3099499940872192, "logps/chosen": -386.27423095703125, "logps/rejected": -423.3670959472656, "loss": 0.6161, "rewards/accuracies": 0.6875, "rewards/chosen": -1.42852783203125, "rewards/margins": 0.5442193150520325, "rewards/rejected": -1.9727470874786377, "step": 1260 }, { "epoch": 0.17, "grad_norm": 8.6875, "learning_rate": 4.933768060566654e-06, "logits/chosen": -0.03582334518432617, "logits/rejected": 1.0751930475234985, "logps/chosen": -376.5238952636719, "logps/rejected": -389.35760498046875, "loss": 0.5561, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0833394527435303, "rewards/margins": 0.5493221879005432, "rewards/rejected": -1.6326615810394287, "step": 1270 }, { "epoch": 0.17, "grad_norm": 7.71875, "learning_rate": 4.931131271234052e-06, "logits/chosen": 0.22822928428649902, "logits/rejected": 1.3094651699066162, "logps/chosen": -337.14202880859375, "logps/rejected": -388.39044189453125, "loss": 0.5507, "rewards/accuracies": 0.75, "rewards/chosen": -0.9693458676338196, "rewards/margins": 0.6725118160247803, "rewards/rejected": -1.6418577432632446, "step": 1280 }, { "epoch": 0.17, "grad_norm": 16.125, "learning_rate": 4.928443746704448e-06, "logits/chosen": 1.678131341934204, "logits/rejected": 2.727170944213867, "logps/chosen": -398.09490966796875, "logps/rejected": -431.4143981933594, "loss": 0.6014, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.361238956451416, "rewards/margins": 0.689416766166687, "rewards/rejected": -2.0506556034088135, "step": 1290 }, { "epoch": 0.17, "grad_norm": 8.75, "learning_rate": 4.925705543063703e-06, "logits/chosen": 1.2936311960220337, "logits/rejected": 2.5942752361297607, "logps/chosen": -441.23931884765625, "logps/rejected": -458.55279541015625, "loss": 0.5709, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5207133293151855, "rewards/margins": 0.6074548959732056, "rewards/rejected": -2.1281678676605225, "step": 1300 }, { "epoch": 0.17, "eval_logits/chosen": 3.2569968700408936, "eval_logits/rejected": 4.131510257720947, "eval_logps/chosen": -437.77825927734375, "eval_logps/rejected": -481.25823974609375, "eval_loss": 0.5480995774269104, "eval_rewards/accuracies": 0.7245000004768372, "eval_rewards/chosen": -1.7315717935562134, "eval_rewards/margins": 0.6352214813232422, "eval_rewards/rejected": -2.366793394088745, "eval_runtime": 1593.4224, "eval_samples_per_second": 1.255, "eval_steps_per_second": 0.314, "step": 1300 }, { "epoch": 0.17, "grad_norm": 10.75, "learning_rate": 4.922916717455297e-06, "logits/chosen": 2.208543300628662, "logits/rejected": 3.476393938064575, "logps/chosen": -419.43719482421875, "logps/rejected": -469.15582275390625, "loss": 0.4688, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7308155298233032, "rewards/margins": 0.9251171946525574, "rewards/rejected": -2.655932903289795, "step": 1310 }, { "epoch": 0.17, "grad_norm": 14.125, "learning_rate": 4.920077328079136e-06, "logits/chosen": 0.9731375575065613, "logits/rejected": 1.9899227619171143, "logps/chosen": -395.8177795410156, "logps/rejected": -478.02020263671875, "loss": 0.5184, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.540832757949829, "rewards/margins": 0.8474717140197754, "rewards/rejected": -2.3883044719696045, "step": 1320 }, { "epoch": 0.17, "grad_norm": 10.6875, "learning_rate": 4.9171874341903445e-06, "logits/chosen": 0.6347458362579346, "logits/rejected": 1.3302295207977295, "logps/chosen": -389.74908447265625, "logps/rejected": -467.36187744140625, "loss": 0.5382, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.3787751197814941, "rewards/margins": 0.6735467910766602, "rewards/rejected": -2.0523219108581543, "step": 1330 }, { "epoch": 0.18, "grad_norm": 10.625, "learning_rate": 4.914247096098019e-06, "logits/chosen": 0.6093196272850037, "logits/rejected": 1.4135183095932007, "logps/chosen": -438.0009765625, "logps/rejected": -489.31610107421875, "loss": 0.5539, "rewards/accuracies": 0.6875, "rewards/chosen": -1.742841124534607, "rewards/margins": 0.6880618333816528, "rewards/rejected": -2.4309029579162598, "step": 1340 }, { "epoch": 0.18, "grad_norm": 17.0, "learning_rate": 4.911256375163977e-06, "logits/chosen": 2.0072357654571533, "logits/rejected": 1.9592632055282593, "logps/chosen": -486.7958984375, "logps/rejected": -544.7276611328125, "loss": 0.6466, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.335031270980835, "rewards/margins": 0.4429550766944885, "rewards/rejected": -2.7779860496520996, "step": 1350 }, { "epoch": 0.18, "grad_norm": 10.25, "learning_rate": 4.908215333801474e-06, "logits/chosen": 2.0604007244110107, "logits/rejected": 2.6346802711486816, "logps/chosen": -434.343017578125, "logps/rejected": -519.6720581054688, "loss": 0.5155, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.1249516010284424, "rewards/margins": 0.7996464967727661, "rewards/rejected": -2.92459774017334, "step": 1360 }, { "epoch": 0.18, "grad_norm": 8.9375, "learning_rate": 4.9051240354739004e-06, "logits/chosen": 1.6247851848602295, "logits/rejected": 2.725818157196045, "logps/chosen": -488.3594665527344, "logps/rejected": -547.042724609375, "loss": 0.4948, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.056096315383911, "rewards/margins": 0.8274715542793274, "rewards/rejected": -2.8835678100585938, "step": 1370 }, { "epoch": 0.18, "grad_norm": 17.0, "learning_rate": 4.901982544693457e-06, "logits/chosen": 2.3090920448303223, "logits/rejected": 3.1643126010894775, "logps/chosen": -486.04339599609375, "logps/rejected": -573.066650390625, "loss": 0.4762, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1145107746124268, "rewards/margins": 1.0138976573944092, "rewards/rejected": -3.128408432006836, "step": 1380 }, { "epoch": 0.18, "grad_norm": 14.0, "learning_rate": 4.898790927019809e-06, "logits/chosen": 2.4458489418029785, "logits/rejected": 3.7828209400177, "logps/chosen": -495.18939208984375, "logps/rejected": -551.55126953125, "loss": 0.5416, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.4461328983306885, "rewards/margins": 0.7788988351821899, "rewards/rejected": -3.225032091140747, "step": 1390 }, { "epoch": 0.18, "grad_norm": 7.71875, "learning_rate": 4.895549249058718e-06, "logits/chosen": 2.8335728645324707, "logits/rejected": 3.7893567085266113, "logps/chosen": -523.3660888671875, "logps/rejected": -584.7088623046875, "loss": 0.5181, "rewards/accuracies": 0.75, "rewards/chosen": -2.668732166290283, "rewards/margins": 0.8389676213264465, "rewards/rejected": -3.507699966430664, "step": 1400 }, { "epoch": 0.18, "eval_logits/chosen": 3.6943774223327637, "eval_logits/rejected": 4.697685241699219, "eval_logps/chosen": -513.1900024414062, "eval_logps/rejected": -583.56396484375, "eval_loss": 0.5454376935958862, "eval_rewards/accuracies": 0.7139999866485596, "eval_rewards/chosen": -2.4856903553009033, "eval_rewards/margins": 0.904159665107727, "eval_rewards/rejected": -3.389849901199341, "eval_runtime": 1592.6989, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 1400 }, { "epoch": 0.18, "grad_norm": 8.6875, "learning_rate": 4.892257578460656e-06, "logits/chosen": 1.9299166202545166, "logits/rejected": 2.9093315601348877, "logps/chosen": -498.1167907714844, "logps/rejected": -564.88330078125, "loss": 0.6278, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.382009983062744, "rewards/margins": 0.6775062680244446, "rewards/rejected": -3.059515953063965, "step": 1410 }, { "epoch": 0.19, "grad_norm": 9.3125, "learning_rate": 4.888915983919383e-06, "logits/chosen": 1.348301649093628, "logits/rejected": 2.4204530715942383, "logps/chosen": -446.24456787109375, "logps/rejected": -506.3639221191406, "loss": 0.5718, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0002195835113525, "rewards/margins": 0.7560812830924988, "rewards/rejected": -2.756300926208496, "step": 1420 }, { "epoch": 0.19, "grad_norm": 5.8125, "learning_rate": 4.885524535170525e-06, "logits/chosen": 1.6420373916625977, "logits/rejected": 2.3852953910827637, "logps/chosen": -449.477294921875, "logps/rejected": -495.0896911621094, "loss": 0.5018, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8907318115234375, "rewards/margins": 0.7322606444358826, "rewards/rejected": -2.622992515563965, "step": 1430 }, { "epoch": 0.19, "grad_norm": 8.625, "learning_rate": 4.882083302990113e-06, "logits/chosen": 1.841507911682129, "logits/rejected": 2.7790274620056152, "logps/chosen": -488.05938720703125, "logps/rejected": -542.5457763671875, "loss": 0.5289, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.215773105621338, "rewards/margins": 0.7926411032676697, "rewards/rejected": -3.0084145069122314, "step": 1440 }, { "epoch": 0.19, "grad_norm": 6.84375, "learning_rate": 4.878592359193104e-06, "logits/chosen": 2.2885186672210693, "logits/rejected": 2.9249768257141113, "logps/chosen": -469.7869567871094, "logps/rejected": -578.7250366210938, "loss": 0.5524, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.663689136505127, "rewards/margins": 0.9176554679870605, "rewards/rejected": -3.5813446044921875, "step": 1450 }, { "epoch": 0.19, "grad_norm": 10.0, "learning_rate": 4.875051776631888e-06, "logits/chosen": 2.708163022994995, "logits/rejected": 3.2748348712921143, "logps/chosen": -531.5555419921875, "logps/rejected": -622.9147338867188, "loss": 0.528, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.9948220252990723, "rewards/margins": 0.846728503704071, "rewards/rejected": -3.84155011177063, "step": 1460 }, { "epoch": 0.19, "grad_norm": 8.375, "learning_rate": 4.871461629194764e-06, "logits/chosen": 2.8306050300598145, "logits/rejected": 4.097413539886475, "logps/chosen": -667.3284912109375, "logps/rejected": -681.8602905273438, "loss": 0.6414, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.854893922805786, "rewards/margins": 0.5547782778739929, "rewards/rejected": -4.409672737121582, "step": 1470 }, { "epoch": 0.19, "grad_norm": 10.625, "learning_rate": 4.8678219918043984e-06, "logits/chosen": 2.2125823497772217, "logits/rejected": 2.6229119300842285, "logps/chosen": -522.2234497070312, "logps/rejected": -616.5860595703125, "loss": 0.5614, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.176955461502075, "rewards/margins": 0.8344646692276001, "rewards/rejected": -4.011419773101807, "step": 1480 }, { "epoch": 0.19, "grad_norm": 10.5, "learning_rate": 4.864132940416262e-06, "logits/chosen": 1.9612172842025757, "logits/rejected": 3.014679431915283, "logps/chosen": -548.0928344726562, "logps/rejected": -592.0524291992188, "loss": 0.5564, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.925997495651245, "rewards/margins": 0.7844290137290955, "rewards/rejected": -3.710425853729248, "step": 1490 }, { "epoch": 0.2, "grad_norm": 14.375, "learning_rate": 4.860394552017044e-06, "logits/chosen": 2.663686990737915, "logits/rejected": 3.601266384124756, "logps/chosen": -519.1183471679688, "logps/rejected": -590.6419067382812, "loss": 0.5495, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.768385887145996, "rewards/margins": 0.7279509902000427, "rewards/rejected": -3.4963364601135254, "step": 1500 }, { "epoch": 0.2, "eval_logits/chosen": 3.288820266723633, "eval_logits/rejected": 4.1846513748168945, "eval_logps/chosen": -520.6432495117188, "eval_logps/rejected": -580.3214721679688, "eval_loss": 0.5428246855735779, "eval_rewards/accuracies": 0.7204999923706055, "eval_rewards/chosen": -2.5602221488952637, "eval_rewards/margins": 0.797203540802002, "eval_rewards/rejected": -3.3574254512786865, "eval_runtime": 1592.2923, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 1500 }, { "epoch": 0.2, "grad_norm": 7.96875, "learning_rate": 4.856606904623047e-06, "logits/chosen": 1.537600040435791, "logits/rejected": 2.585251569747925, "logps/chosen": -543.2179565429688, "logps/rejected": -580.5303344726562, "loss": 0.5442, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.247091293334961, "rewards/margins": 0.7817636728286743, "rewards/rejected": -3.0288548469543457, "step": 1510 }, { "epoch": 0.2, "grad_norm": 12.8125, "learning_rate": 4.852770077278557e-06, "logits/chosen": 1.9782816171646118, "logits/rejected": 2.7694122791290283, "logps/chosen": -489.9452209472656, "logps/rejected": -541.9598388671875, "loss": 0.4728, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.1113080978393555, "rewards/margins": 0.8755633234977722, "rewards/rejected": -2.9868717193603516, "step": 1520 }, { "epoch": 0.2, "grad_norm": 6.71875, "learning_rate": 4.848884150054196e-06, "logits/chosen": 2.329437732696533, "logits/rejected": 2.7830517292022705, "logps/chosen": -549.2725219726562, "logps/rejected": -610.2501220703125, "loss": 0.5669, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.0749659538269043, "rewards/margins": 0.8318060636520386, "rewards/rejected": -3.9067721366882324, "step": 1530 }, { "epoch": 0.2, "grad_norm": 12.6875, "learning_rate": 4.8449492040452495e-06, "logits/chosen": 2.3675220012664795, "logits/rejected": 3.3951308727264404, "logps/chosen": -585.8651733398438, "logps/rejected": -600.3690795898438, "loss": 0.6549, "rewards/accuracies": 0.6875, "rewards/chosen": -3.3787026405334473, "rewards/margins": 0.6356854438781738, "rewards/rejected": -4.014388084411621, "step": 1540 }, { "epoch": 0.2, "grad_norm": 15.125, "learning_rate": 4.840965321369973e-06, "logits/chosen": 2.4867424964904785, "logits/rejected": 3.2565741539001465, "logps/chosen": -635.0973510742188, "logps/rejected": -656.8734130859375, "loss": 0.6027, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.280498504638672, "rewards/margins": 0.65467768907547, "rewards/rejected": -3.935176134109497, "step": 1550 }, { "epoch": 0.2, "grad_norm": 18.625, "learning_rate": 4.8369325851678795e-06, "logits/chosen": 2.721409559249878, "logits/rejected": 3.499811887741089, "logps/chosen": -575.1268310546875, "logps/rejected": -651.55224609375, "loss": 0.5457, "rewards/accuracies": 0.6875, "rewards/chosen": -3.1099348068237305, "rewards/margins": 0.82475346326828, "rewards/rejected": -3.934687852859497, "step": 1560 }, { "epoch": 0.21, "grad_norm": 8.8125, "learning_rate": 4.832851079598007e-06, "logits/chosen": 3.0259532928466797, "logits/rejected": 3.415708065032959, "logps/chosen": -615.1463623046875, "logps/rejected": -673.1775512695312, "loss": 0.5671, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.413778305053711, "rewards/margins": 0.6115729212760925, "rewards/rejected": -4.025351047515869, "step": 1570 }, { "epoch": 0.21, "grad_norm": 17.125, "learning_rate": 4.828720889837158e-06, "logits/chosen": 2.180393695831299, "logits/rejected": 3.560591459274292, "logps/chosen": -597.0384521484375, "logps/rejected": -648.1893310546875, "loss": 0.4638, "rewards/accuracies": 0.75, "rewards/chosen": -3.2695510387420654, "rewards/margins": 1.0302797555923462, "rewards/rejected": -4.299830436706543, "step": 1580 }, { "epoch": 0.21, "grad_norm": 16.125, "learning_rate": 4.824542102078125e-06, "logits/chosen": 2.5585384368896484, "logits/rejected": 3.3628134727478027, "logps/chosen": -559.4739379882812, "logps/rejected": -660.89013671875, "loss": 0.4251, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.904505491256714, "rewards/margins": 1.1467950344085693, "rewards/rejected": -4.051300525665283, "step": 1590 }, { "epoch": 0.21, "grad_norm": 8.5625, "learning_rate": 4.820314803527888e-06, "logits/chosen": 2.209808588027954, "logits/rejected": 2.755720853805542, "logps/chosen": -527.7376708984375, "logps/rejected": -611.8505249023438, "loss": 0.574, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.922621011734009, "rewards/margins": 0.8603001832962036, "rewards/rejected": -3.782921314239502, "step": 1600 }, { "epoch": 0.21, "eval_logits/chosen": 3.930431842803955, "eval_logits/rejected": 4.921872138977051, "eval_logps/chosen": -535.627685546875, "eval_logps/rejected": -599.0427856445312, "eval_loss": 0.5638437271118164, "eval_rewards/accuracies": 0.718999981880188, "eval_rewards/chosen": -2.710066795349121, "eval_rewards/margins": 0.8345724940299988, "eval_rewards/rejected": -3.5446391105651855, "eval_runtime": 1592.9451, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 1600 }, { "epoch": 0.21, "grad_norm": 15.75, "learning_rate": 4.816039082405799e-06, "logits/chosen": 2.2764475345611572, "logits/rejected": 2.266634941101074, "logps/chosen": -485.3955993652344, "logps/rejected": -614.9955444335938, "loss": 0.5072, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.3901500701904297, "rewards/margins": 0.995215117931366, "rewards/rejected": -3.3853652477264404, "step": 1610 }, { "epoch": 0.21, "grad_norm": 7.59375, "learning_rate": 4.81171502794174e-06, "logits/chosen": 2.212632417678833, "logits/rejected": 3.119572401046753, "logps/chosen": -431.06182861328125, "logps/rejected": -525.5626220703125, "loss": 0.5715, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.3643248081207275, "rewards/margins": 0.8816196322441101, "rewards/rejected": -3.2459442615509033, "step": 1620 }, { "epoch": 0.21, "grad_norm": 5.96875, "learning_rate": 4.8073427303742584e-06, "logits/chosen": 2.651143789291382, "logits/rejected": 3.788827896118164, "logps/chosen": -436.7945861816406, "logps/rejected": -547.8876953125, "loss": 0.5112, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.291949510574341, "rewards/margins": 1.0054199695587158, "rewards/rejected": -3.2973690032958984, "step": 1630 }, { "epoch": 0.21, "grad_norm": 17.625, "learning_rate": 4.802922280948685e-06, "logits/chosen": 2.9048256874084473, "logits/rejected": 2.298372507095337, "logps/chosen": -445.8115234375, "logps/rejected": -583.6910400390625, "loss": 0.5133, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.4814229011535645, "rewards/margins": 0.9504474401473999, "rewards/rejected": -3.431870698928833, "step": 1640 }, { "epoch": 0.22, "grad_norm": 9.75, "learning_rate": 4.798453771915231e-06, "logits/chosen": 2.131483316421509, "logits/rejected": 2.286714553833008, "logps/chosen": -550.1790161132812, "logps/rejected": -614.5077514648438, "loss": 0.5829, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.8355047702789307, "rewards/margins": 0.6407086849212646, "rewards/rejected": -3.4762134552001953, "step": 1650 }, { "epoch": 0.22, "grad_norm": 10.0625, "learning_rate": 4.793937296527062e-06, "logits/chosen": 2.0528528690338135, "logits/rejected": 3.4341843128204346, "logps/chosen": -583.1158447265625, "logps/rejected": -599.8079223632812, "loss": 0.7019, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.1697096824645996, "rewards/margins": 0.5639287233352661, "rewards/rejected": -3.7336387634277344, "step": 1660 }, { "epoch": 0.22, "grad_norm": 6.5625, "learning_rate": 4.78937294903835e-06, "logits/chosen": 1.3232476711273193, "logits/rejected": 3.333897829055786, "logps/chosen": -524.6771240234375, "logps/rejected": -585.8258056640625, "loss": 0.4743, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.5064637660980225, "rewards/margins": 1.062355875968933, "rewards/rejected": -3.568819522857666, "step": 1670 }, { "epoch": 0.22, "grad_norm": 12.1875, "learning_rate": 4.78476082470231e-06, "logits/chosen": 1.2536879777908325, "logits/rejected": 2.9530956745147705, "logps/chosen": -561.2202758789062, "logps/rejected": -656.8157958984375, "loss": 0.4279, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.7209866046905518, "rewards/margins": 1.2556803226470947, "rewards/rejected": -3.9766669273376465, "step": 1680 }, { "epoch": 0.22, "grad_norm": 8.375, "learning_rate": 4.780101019769212e-06, "logits/chosen": 2.281708240509033, "logits/rejected": 3.858910322189331, "logps/chosen": -669.6038818359375, "logps/rejected": -732.0891723632812, "loss": 0.6202, "rewards/accuracies": 0.6875, "rewards/chosen": -3.8110530376434326, "rewards/margins": 0.9638622403144836, "rewards/rejected": -4.7749152183532715, "step": 1690 }, { "epoch": 0.22, "grad_norm": 8.625, "learning_rate": 4.775393631484368e-06, "logits/chosen": 1.3264646530151367, "logits/rejected": 3.5289459228515625, "logps/chosen": -596.949462890625, "logps/rejected": -665.1720581054688, "loss": 0.4901, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.0804290771484375, "rewards/margins": 1.0506683588027954, "rewards/rejected": -4.131097316741943, "step": 1700 }, { "epoch": 0.22, "eval_logits/chosen": 2.930548667907715, "eval_logits/rejected": 3.8220255374908447, "eval_logps/chosen": -513.6201171875, "eval_logps/rejected": -580.3493041992188, "eval_loss": 0.528390109539032, "eval_rewards/accuracies": 0.7335000038146973, "eval_rewards/chosen": -2.4899909496307373, "eval_rewards/margins": 0.8677131533622742, "eval_rewards/rejected": -3.3577041625976562, "eval_runtime": 1592.0798, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 1700 }, { "epoch": 0.22, "grad_norm": 5.625, "learning_rate": 4.770638758086105e-06, "logits/chosen": 1.39579176902771, "logits/rejected": 1.9867902994155884, "logps/chosen": -454.69696044921875, "logps/rejected": -540.9384765625, "loss": 0.5284, "rewards/accuracies": 0.75, "rewards/chosen": -1.9679079055786133, "rewards/margins": 0.7821384072303772, "rewards/rejected": -2.7500462532043457, "step": 1710 }, { "epoch": 0.23, "grad_norm": 15.625, "learning_rate": 4.7658364988037184e-06, "logits/chosen": 0.4757395386695862, "logits/rejected": 1.7275594472885132, "logps/chosen": -420.74237060546875, "logps/rejected": -450.6329040527344, "loss": 0.5074, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5048556327819824, "rewards/margins": 0.7989991903305054, "rewards/rejected": -2.3038547039031982, "step": 1720 }, { "epoch": 0.23, "grad_norm": 19.0, "learning_rate": 4.760986953855395e-06, "logits/chosen": 0.5297383069992065, "logits/rejected": 1.4827096462249756, "logps/chosen": -479.01263427734375, "logps/rejected": -472.995849609375, "loss": 0.5399, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6811622381210327, "rewards/margins": 0.7285897135734558, "rewards/rejected": -2.4097516536712646, "step": 1730 }, { "epoch": 0.23, "grad_norm": 17.875, "learning_rate": 4.756090224446127e-06, "logits/chosen": 1.6546382904052734, "logits/rejected": 2.8603973388671875, "logps/chosen": -511.18109130859375, "logps/rejected": -617.1567993164062, "loss": 0.4234, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.4521613121032715, "rewards/margins": 1.1986668109893799, "rewards/rejected": -3.6508285999298096, "step": 1740 }, { "epoch": 0.23, "grad_norm": 15.9375, "learning_rate": 4.7511464127655945e-06, "logits/chosen": 2.412151575088501, "logits/rejected": 2.7965240478515625, "logps/chosen": -564.255859375, "logps/rejected": -674.6925048828125, "loss": 0.5344, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.802210569381714, "rewards/margins": 1.1913468837738037, "rewards/rejected": -3.9935576915740967, "step": 1750 }, { "epoch": 0.23, "grad_norm": 19.25, "learning_rate": 4.74615562198604e-06, "logits/chosen": 2.0918798446655273, "logits/rejected": 3.590376615524292, "logps/chosen": -531.4205322265625, "logps/rejected": -628.2448120117188, "loss": 0.5798, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.993215322494507, "rewards/margins": 1.028847336769104, "rewards/rejected": -4.0220627784729, "step": 1760 }, { "epoch": 0.23, "grad_norm": 7.84375, "learning_rate": 4.741117956260107e-06, "logits/chosen": 1.7307329177856445, "logits/rejected": 2.0027899742126465, "logps/chosen": -564.1361083984375, "logps/rejected": -621.1322021484375, "loss": 0.5516, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.6234326362609863, "rewards/margins": 0.8255942463874817, "rewards/rejected": -3.4490268230438232, "step": 1770 }, { "epoch": 0.23, "grad_norm": 9.8125, "learning_rate": 4.736033520718672e-06, "logits/chosen": 0.6438279151916504, "logits/rejected": 1.2121880054473877, "logps/chosen": -454.52545166015625, "logps/rejected": -499.5397033691406, "loss": 0.5871, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9997762441635132, "rewards/margins": 0.5949224233627319, "rewards/rejected": -2.594698667526245, "step": 1780 }, { "epoch": 0.23, "grad_norm": 15.125, "learning_rate": 4.730902421468652e-06, "logits/chosen": 0.7901066541671753, "logits/rejected": 0.7325730919837952, "logps/chosen": -462.3359375, "logps/rejected": -552.4173583984375, "loss": 0.5828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9955250024795532, "rewards/margins": 0.6368887424468994, "rewards/rejected": -2.632413864135742, "step": 1790 }, { "epoch": 0.24, "grad_norm": 6.65625, "learning_rate": 4.7257247655907854e-06, "logits/chosen": 0.637048602104187, "logits/rejected": 1.482924461364746, "logps/chosen": -407.1905822753906, "logps/rejected": -496.34161376953125, "loss": 0.5149, "rewards/accuracies": 0.6875, "rewards/chosen": -1.690338134765625, "rewards/margins": 0.9209977984428406, "rewards/rejected": -2.6113357543945312, "step": 1800 }, { "epoch": 0.24, "eval_logits/chosen": 1.2750667333602905, "eval_logits/rejected": 2.0261995792388916, "eval_logps/chosen": -439.68988037109375, "eval_logps/rejected": -491.2047424316406, "eval_loss": 0.5408201813697815, "eval_rewards/accuracies": 0.7214999794960022, "eval_rewards/chosen": -1.7506884336471558, "eval_rewards/margins": 0.7155702114105225, "eval_rewards/rejected": -2.4662585258483887, "eval_runtime": 1592.6919, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 1800 }, { "epoch": 0.24, "grad_norm": 12.25, "learning_rate": 4.720500661137397e-06, "logits/chosen": 0.7021459341049194, "logits/rejected": 1.6718536615371704, "logps/chosen": -470.3685607910156, "logps/rejected": -503.8094787597656, "loss": 0.5659, "rewards/accuracies": 0.75, "rewards/chosen": -1.9385855197906494, "rewards/margins": 0.6853693127632141, "rewards/rejected": -2.6239547729492188, "step": 1810 }, { "epoch": 0.24, "grad_norm": 15.5625, "learning_rate": 4.71523021713015e-06, "logits/chosen": 0.9356092214584351, "logits/rejected": 1.5934399366378784, "logps/chosen": -463.17303466796875, "logps/rejected": -509.04864501953125, "loss": 0.6068, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.297934055328369, "rewards/margins": 0.5046521425247192, "rewards/rejected": -2.802586078643799, "step": 1820 }, { "epoch": 0.24, "grad_norm": 18.25, "learning_rate": 4.709913543557761e-06, "logits/chosen": 1.4963045120239258, "logits/rejected": 1.9225801229476929, "logps/chosen": -482.92852783203125, "logps/rejected": -578.1611938476562, "loss": 0.5275, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.498018980026245, "rewards/margins": 1.007016897201538, "rewards/rejected": -3.505035877227783, "step": 1830 }, { "epoch": 0.24, "grad_norm": 9.0625, "learning_rate": 4.704550751373715e-06, "logits/chosen": 1.148832082748413, "logits/rejected": 1.9787461757659912, "logps/chosen": -577.2870483398438, "logps/rejected": -619.6170043945312, "loss": 0.6045, "rewards/accuracies": 0.6875, "rewards/chosen": -2.957585096359253, "rewards/margins": 0.6815073490142822, "rewards/rejected": -3.639092206954956, "step": 1840 }, { "epoch": 0.24, "grad_norm": 4.5, "learning_rate": 4.699141952493941e-06, "logits/chosen": 1.7764909267425537, "logits/rejected": 2.5067789554595947, "logps/chosen": -537.6290893554688, "logps/rejected": -584.9334106445312, "loss": 0.5237, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.823072910308838, "rewards/margins": 0.7261917591094971, "rewards/rejected": -3.549264907836914, "step": 1850 }, { "epoch": 0.24, "grad_norm": 18.625, "learning_rate": 4.6936872597944814e-06, "logits/chosen": 1.7310603857040405, "logits/rejected": 2.559417247772217, "logps/chosen": -464.6128845214844, "logps/rejected": -559.8570556640625, "loss": 0.4748, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.3967392444610596, "rewards/margins": 0.9208498001098633, "rewards/rejected": -3.317589282989502, "step": 1860 }, { "epoch": 0.24, "grad_norm": 20.25, "learning_rate": 4.688186787109136e-06, "logits/chosen": 1.956180214881897, "logits/rejected": 3.564734935760498, "logps/chosen": -528.5537109375, "logps/rejected": -586.9791870117188, "loss": 0.5301, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.75168514251709, "rewards/margins": 0.8507223129272461, "rewards/rejected": -3.602407455444336, "step": 1870 }, { "epoch": 0.25, "grad_norm": 8.3125, "learning_rate": 4.682640649227085e-06, "logits/chosen": 1.8421905040740967, "logits/rejected": 3.96189546585083, "logps/chosen": -570.6896362304688, "logps/rejected": -673.6886596679688, "loss": 0.4183, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9929141998291016, "rewards/margins": 1.4335700273513794, "rewards/rejected": -4.42648458480835, "step": 1880 }, { "epoch": 0.25, "grad_norm": 26.625, "learning_rate": 4.677048961890492e-06, "logits/chosen": 2.172407627105713, "logits/rejected": 2.843996524810791, "logps/chosen": -598.6932373046875, "logps/rejected": -748.4923095703125, "loss": 0.5991, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.3650193214416504, "rewards/margins": 1.2889230251312256, "rewards/rejected": -4.653942108154297, "step": 1890 }, { "epoch": 0.25, "grad_norm": 25.375, "learning_rate": 4.671411841792096e-06, "logits/chosen": 1.5214734077453613, "logits/rejected": 1.8940036296844482, "logps/chosen": -554.8588256835938, "logps/rejected": -598.6972045898438, "loss": 0.6382, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.618443012237549, "rewards/margins": 0.6023415327072144, "rewards/rejected": -3.2207846641540527, "step": 1900 }, { "epoch": 0.25, "eval_logits/chosen": 1.4989570379257202, "eval_logits/rejected": 2.4038922786712646, "eval_logps/chosen": -477.3052062988281, "eval_logps/rejected": -540.05419921875, "eval_loss": 0.5325160622596741, "eval_rewards/accuracies": 0.7254999876022339, "eval_rewards/chosen": -2.1268417835235596, "eval_rewards/margins": 0.8279104232788086, "eval_rewards/rejected": -2.954752206802368, "eval_runtime": 1591.5791, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 1900 }, { "epoch": 0.25, "grad_norm": 11.5, "learning_rate": 4.665729406572764e-06, "logits/chosen": -0.22512850165367126, "logits/rejected": 0.8694722056388855, "logps/chosen": -463.45330810546875, "logps/rejected": -509.54461669921875, "loss": 0.5328, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8640069961547852, "rewards/margins": 0.756619930267334, "rewards/rejected": -2.620626926422119, "step": 1910 }, { "epoch": 0.25, "grad_norm": 11.25, "learning_rate": 4.660001774819048e-06, "logits/chosen": 0.6744810342788696, "logits/rejected": 2.1456856727600098, "logps/chosen": -497.3128967285156, "logps/rejected": -557.4090576171875, "loss": 0.4914, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.0291192531585693, "rewards/margins": 0.9803076982498169, "rewards/rejected": -3.0094268321990967, "step": 1920 }, { "epoch": 0.25, "grad_norm": 7.71875, "learning_rate": 4.654229066060702e-06, "logits/chosen": 2.223780870437622, "logits/rejected": 2.437018871307373, "logps/chosen": -439.65240478515625, "logps/rejected": -563.2244873046875, "loss": 0.5364, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.4993643760681152, "rewards/margins": 0.9292726516723633, "rewards/rejected": -3.4286370277404785, "step": 1930 }, { "epoch": 0.25, "grad_norm": 9.3125, "learning_rate": 4.648411400768193e-06, "logits/chosen": 1.3439040184020996, "logits/rejected": 2.485945463180542, "logps/chosen": -523.4782104492188, "logps/rejected": -575.5657958984375, "loss": 0.5693, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.6590380668640137, "rewards/margins": 0.8810779452323914, "rewards/rejected": -3.54011607170105, "step": 1940 }, { "epoch": 0.26, "grad_norm": 8.9375, "learning_rate": 4.642548900350182e-06, "logits/chosen": 1.0268100500106812, "logits/rejected": 1.1440961360931396, "logps/chosen": -480.35028076171875, "logps/rejected": -516.804443359375, "loss": 0.6716, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.193833827972412, "rewards/margins": 0.443396657705307, "rewards/rejected": -2.637230396270752, "step": 1950 }, { "epoch": 0.26, "grad_norm": 12.0, "learning_rate": 4.636641687150994e-06, "logits/chosen": -0.1693330854177475, "logits/rejected": 0.4488976001739502, "logps/chosen": -465.98529052734375, "logps/rejected": -505.60015869140625, "loss": 0.5583, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8904263973236084, "rewards/margins": 0.6098214983940125, "rewards/rejected": -2.5002479553222656, "step": 1960 }, { "epoch": 0.26, "grad_norm": 12.0625, "learning_rate": 4.6306898844480615e-06, "logits/chosen": 0.20104345679283142, "logits/rejected": 1.6518051624298096, "logps/chosen": -539.2623901367188, "logps/rejected": -560.7965698242188, "loss": 0.4878, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1119306087493896, "rewards/margins": 0.8724871873855591, "rewards/rejected": -2.984417676925659, "step": 1970 }, { "epoch": 0.26, "grad_norm": 8.75, "learning_rate": 4.624693616449358e-06, "logits/chosen": 0.3932510018348694, "logits/rejected": 1.3235846757888794, "logps/chosen": -443.524658203125, "logps/rejected": -467.9271545410156, "loss": 0.5635, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0940842628479004, "rewards/margins": 0.68193119764328, "rewards/rejected": -2.7760157585144043, "step": 1980 }, { "epoch": 0.26, "grad_norm": 15.625, "learning_rate": 4.6186530082908e-06, "logits/chosen": 0.6695358157157898, "logits/rejected": 0.8175237774848938, "logps/chosen": -404.23663330078125, "logps/rejected": -461.8468322753906, "loss": 0.6994, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.012613534927368, "rewards/margins": 0.565791130065918, "rewards/rejected": -2.578404664993286, "step": 1990 }, { "epoch": 0.26, "grad_norm": 18.75, "learning_rate": 4.612568186033633e-06, "logits/chosen": -0.0862119197845459, "logits/rejected": 1.158911943435669, "logps/chosen": -453.42706298828125, "logps/rejected": -497.0005798339844, "loss": 0.5178, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5579051971435547, "rewards/margins": 0.8818111419677734, "rewards/rejected": -2.4397165775299072, "step": 2000 }, { "epoch": 0.26, "eval_logits/chosen": 0.8156982064247131, "eval_logits/rejected": 1.528757929801941, "eval_logps/chosen": -406.8324279785156, "eval_logps/rejected": -459.8389587402344, "eval_loss": 0.5275577306747437, "eval_rewards/accuracies": 0.7304999828338623, "eval_rewards/chosen": -1.4221142530441284, "eval_rewards/margins": 0.7304863333702087, "eval_rewards/rejected": -2.1526007652282715, "eval_runtime": 1591.2153, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 2000 }, { "epoch": 0.26, "grad_norm": 11.6875, "learning_rate": 4.6064392766618125e-06, "logits/chosen": 0.2911849617958069, "logits/rejected": 1.4484639167785645, "logps/chosen": -415.20855712890625, "logps/rejected": -454.56243896484375, "loss": 0.4964, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.474690318107605, "rewards/margins": 0.811174213886261, "rewards/rejected": -2.2858645915985107, "step": 2010 }, { "epoch": 0.26, "grad_norm": 12.3125, "learning_rate": 4.60026640807934e-06, "logits/chosen": 1.1492453813552856, "logits/rejected": 1.788846731185913, "logps/chosen": -466.96124267578125, "logps/rejected": -536.4459228515625, "loss": 0.5872, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9817111492156982, "rewards/margins": 0.7023847699165344, "rewards/rejected": -2.684096097946167, "step": 2020 }, { "epoch": 0.27, "grad_norm": 8.9375, "learning_rate": 4.594049709107604e-06, "logits/chosen": 1.0384540557861328, "logits/rejected": 1.8466717004776, "logps/chosen": -509.6580505371094, "logps/rejected": -553.58349609375, "loss": 0.5398, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.156644105911255, "rewards/margins": 0.8427260518074036, "rewards/rejected": -2.9993700981140137, "step": 2030 }, { "epoch": 0.27, "grad_norm": 9.75, "learning_rate": 4.587789309482687e-06, "logits/chosen": 1.0068624019622803, "logits/rejected": 2.298288583755493, "logps/chosen": -425.91693115234375, "logps/rejected": -510.83453369140625, "loss": 0.4572, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.729448676109314, "rewards/margins": 0.987391471862793, "rewards/rejected": -2.7168402671813965, "step": 2040 }, { "epoch": 0.27, "grad_norm": 10.4375, "learning_rate": 4.581485339852659e-06, "logits/chosen": 1.0009605884552002, "logits/rejected": 2.157416582107544, "logps/chosen": -499.15863037109375, "logps/rejected": -580.7846069335938, "loss": 0.5363, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.099259853363037, "rewards/margins": 1.011781930923462, "rewards/rejected": -3.11104154586792, "step": 2050 }, { "epoch": 0.27, "grad_norm": 16.25, "learning_rate": 4.5751379317748514e-06, "logits/chosen": 1.6248559951782227, "logits/rejected": 2.572545051574707, "logps/chosen": -483.1776428222656, "logps/rejected": -605.381103515625, "loss": 0.4643, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.4639997482299805, "rewards/margins": 1.2903715372085571, "rewards/rejected": -3.754370927810669, "step": 2060 }, { "epoch": 0.27, "grad_norm": 17.25, "learning_rate": 4.56874721771311e-06, "logits/chosen": 1.5964148044586182, "logits/rejected": 2.4404406547546387, "logps/chosen": -534.6270751953125, "logps/rejected": -651.9520263671875, "loss": 0.4478, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.535792827606201, "rewards/margins": 1.3512599468231201, "rewards/rejected": -3.8870530128479004, "step": 2070 }, { "epoch": 0.27, "grad_norm": 20.875, "learning_rate": 4.562313331035032e-06, "logits/chosen": 1.3664486408233643, "logits/rejected": 2.4883315563201904, "logps/chosen": -552.6138305664062, "logps/rejected": -656.958251953125, "loss": 0.4909, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7245543003082275, "rewards/margins": 1.1670589447021484, "rewards/rejected": -3.8916125297546387, "step": 2080 }, { "epoch": 0.27, "grad_norm": 14.25, "learning_rate": 4.555836406009183e-06, "logits/chosen": 0.7134484052658081, "logits/rejected": 1.9421746730804443, "logps/chosen": -539.8816528320312, "logps/rejected": -651.5926513671875, "loss": 0.4775, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.59112811088562, "rewards/margins": 1.2943382263183594, "rewards/rejected": -3.8854668140411377, "step": 2090 }, { "epoch": 0.27, "grad_norm": 19.625, "learning_rate": 4.5493165778022945e-06, "logits/chosen": 1.732616662979126, "logits/rejected": 1.5265815258026123, "logps/chosen": -537.0846557617188, "logps/rejected": -678.6043701171875, "loss": 0.524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.924956798553467, "rewards/margins": 1.0166305303573608, "rewards/rejected": -3.941587448120117, "step": 2100 }, { "epoch": 0.27, "eval_logits/chosen": 1.6624584197998047, "eval_logits/rejected": 2.595505714416504, "eval_logps/chosen": -535.6265869140625, "eval_logps/rejected": -615.3445434570312, "eval_loss": 0.5663179159164429, "eval_rewards/accuracies": 0.7110000252723694, "eval_rewards/chosen": -2.710054874420166, "eval_rewards/margins": 0.9976009130477905, "eval_rewards/rejected": -3.707655906677246, "eval_runtime": 1591.602, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 2100 }, { "epoch": 0.28, "grad_norm": 8.875, "learning_rate": 4.542753982476443e-06, "logits/chosen": 0.8328291177749634, "logits/rejected": 1.094390869140625, "logps/chosen": -505.59417724609375, "logps/rejected": -693.96630859375, "loss": 0.3576, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.6475024223327637, "rewards/margins": 1.756042718887329, "rewards/rejected": -4.403544902801514, "step": 2110 }, { "epoch": 0.28, "grad_norm": 22.5, "learning_rate": 4.53614875698621e-06, "logits/chosen": 0.9178856015205383, "logits/rejected": 2.5568594932556152, "logps/chosen": -729.9873046875, "logps/rejected": -763.2886962890625, "loss": 0.9187, "rewards/accuracies": 0.6875, "rewards/chosen": -4.440108299255371, "rewards/margins": 1.0146429538726807, "rewards/rejected": -5.454751491546631, "step": 2120 }, { "epoch": 0.28, "grad_norm": 20.5, "learning_rate": 4.529501039175824e-06, "logits/chosen": 1.178547978401184, "logits/rejected": 2.043323516845703, "logps/chosen": -494.15679931640625, "logps/rejected": -580.7507934570312, "loss": 0.5003, "rewards/accuracies": 0.6875, "rewards/chosen": -2.2062692642211914, "rewards/margins": 1.0896488428115845, "rewards/rejected": -3.2959182262420654, "step": 2130 }, { "epoch": 0.28, "grad_norm": 7.84375, "learning_rate": 4.522810967776287e-06, "logits/chosen": 0.9719040989875793, "logits/rejected": 1.7808208465576172, "logps/chosen": -494.247802734375, "logps/rejected": -565.2037963867188, "loss": 0.5097, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8909088373184204, "rewards/margins": 0.9046257138252258, "rewards/rejected": -2.795534610748291, "step": 2140 }, { "epoch": 0.28, "grad_norm": 23.25, "learning_rate": 4.516078682402473e-06, "logits/chosen": 0.9452457427978516, "logits/rejected": 2.6888928413391113, "logps/chosen": -469.39697265625, "logps/rejected": -555.2576904296875, "loss": 0.524, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.2237601280212402, "rewards/margins": 0.8853996396064758, "rewards/rejected": -3.1091599464416504, "step": 2150 }, { "epoch": 0.28, "grad_norm": 24.25, "learning_rate": 4.509304323550221e-06, "logits/chosen": 1.5407735109329224, "logits/rejected": 2.537536859512329, "logps/chosen": -514.2586669921875, "logps/rejected": -597.7869873046875, "loss": 0.5327, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.269601583480835, "rewards/margins": 1.054694652557373, "rewards/rejected": -3.324296236038208, "step": 2160 }, { "epoch": 0.28, "grad_norm": 9.6875, "learning_rate": 4.502488032593398e-06, "logits/chosen": 2.9527904987335205, "logits/rejected": 3.8193678855895996, "logps/chosen": -491.11798095703125, "logps/rejected": -604.221435546875, "loss": 0.508, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.4709322452545166, "rewards/margins": 1.2204992771148682, "rewards/rejected": -3.6914315223693848, "step": 2170 }, { "epoch": 0.29, "grad_norm": 25.375, "learning_rate": 4.495629951780951e-06, "logits/chosen": 2.891505718231201, "logits/rejected": 2.733696937561035, "logps/chosen": -567.9174194335938, "logps/rejected": -632.1881103515625, "loss": 0.6984, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.162008762359619, "rewards/margins": 0.792600691318512, "rewards/rejected": -3.9546093940734863, "step": 2180 }, { "epoch": 0.29, "grad_norm": 23.125, "learning_rate": 4.488730224233941e-06, "logits/chosen": 2.1970956325531006, "logits/rejected": 2.9282336235046387, "logps/chosen": -524.8446044921875, "logps/rejected": -594.5263671875, "loss": 0.5236, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.49332332611084, "rewards/margins": 1.0459768772125244, "rewards/rejected": -3.5393002033233643, "step": 2190 }, { "epoch": 0.29, "grad_norm": 22.25, "learning_rate": 4.481788993942547e-06, "logits/chosen": 2.2267377376556396, "logits/rejected": 3.006579875946045, "logps/chosen": -480.3287048339844, "logps/rejected": -555.52978515625, "loss": 0.523, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.2022452354431152, "rewards/margins": 1.0402004718780518, "rewards/rejected": -3.242445468902588, "step": 2200 }, { "epoch": 0.29, "eval_logits/chosen": 2.543581962585449, "eval_logits/rejected": 3.595458984375, "eval_logps/chosen": -493.3342590332031, "eval_logps/rejected": -578.9616088867188, "eval_loss": 0.5422174334526062, "eval_rewards/accuracies": 0.7229999899864197, "eval_rewards/chosen": -2.2871320247650146, "eval_rewards/margins": 1.0566951036453247, "eval_rewards/rejected": -3.343827486038208, "eval_runtime": 1591.1009, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 2200 }, { "epoch": 0.29, "grad_norm": 16.75, "learning_rate": 4.474806405763076e-06, "logits/chosen": 1.3435853719711304, "logits/rejected": 2.144718885421753, "logps/chosen": -518.1646728515625, "logps/rejected": -567.1131591796875, "loss": 0.6472, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.471111297607422, "rewards/margins": 0.698428213596344, "rewards/rejected": -3.169539451599121, "step": 2210 }, { "epoch": 0.29, "grad_norm": 15.25, "learning_rate": 4.4677826054149235e-06, "logits/chosen": 0.6909424066543579, "logits/rejected": 1.4299237728118896, "logps/chosen": -452.12847900390625, "logps/rejected": -496.40625, "loss": 0.5732, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.831134557723999, "rewards/margins": 0.7007611989974976, "rewards/rejected": -2.531895875930786, "step": 2220 }, { "epoch": 0.29, "grad_norm": 7.875, "learning_rate": 4.460717739477543e-06, "logits/chosen": 0.33534538745880127, "logits/rejected": 0.7390525937080383, "logps/chosen": -426.89727783203125, "logps/rejected": -453.79998779296875, "loss": 0.6528, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.5729742050170898, "rewards/margins": 0.42158904671669006, "rewards/rejected": -1.994563341140747, "step": 2230 }, { "epoch": 0.29, "grad_norm": 8.0, "learning_rate": 4.4536119553873866e-06, "logits/chosen": -0.13674196600914001, "logits/rejected": 1.186537742614746, "logps/chosen": -401.02313232421875, "logps/rejected": -491.7369689941406, "loss": 0.4291, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5359418392181396, "rewards/margins": 1.016803503036499, "rewards/rejected": -2.5527453422546387, "step": 2240 }, { "epoch": 0.29, "grad_norm": 11.8125, "learning_rate": 4.446465401434824e-06, "logits/chosen": 1.0150126218795776, "logits/rejected": 1.48782479763031, "logps/chosen": -477.7135314941406, "logps/rejected": -568.2080078125, "loss": 0.5028, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1706626415252686, "rewards/margins": 1.0744900703430176, "rewards/rejected": -3.245152235031128, "step": 2250 }, { "epoch": 0.3, "grad_norm": 20.625, "learning_rate": 4.43927822676105e-06, "logits/chosen": 1.2589080333709717, "logits/rejected": 2.024796962738037, "logps/chosen": -509.18017578125, "logps/rejected": -593.2972412109375, "loss": 0.6145, "rewards/accuracies": 0.75, "rewards/chosen": -2.477821111679077, "rewards/margins": 0.9366565942764282, "rewards/rejected": -3.414477825164795, "step": 2260 }, { "epoch": 0.3, "grad_norm": 29.0, "learning_rate": 4.432050581354972e-06, "logits/chosen": 0.7604560852050781, "logits/rejected": 1.928553819656372, "logps/chosen": -517.0828857421875, "logps/rejected": -547.6974487304688, "loss": 0.5421, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.4186508655548096, "rewards/margins": 1.0623056888580322, "rewards/rejected": -3.480956554412842, "step": 2270 }, { "epoch": 0.3, "grad_norm": 14.625, "learning_rate": 4.424782616050078e-06, "logits/chosen": 1.0883769989013672, "logits/rejected": 2.2119033336639404, "logps/chosen": -459.88299560546875, "logps/rejected": -527.73193359375, "loss": 0.474, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.022667169570923, "rewards/margins": 1.0422003269195557, "rewards/rejected": -3.0648674964904785, "step": 2280 }, { "epoch": 0.3, "grad_norm": 7.65625, "learning_rate": 4.4174744825212954e-06, "logits/chosen": 0.5496788024902344, "logits/rejected": 2.727740526199341, "logps/chosen": -523.7022094726562, "logps/rejected": -566.1196899414062, "loss": 0.4936, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.125478506088257, "rewards/margins": 1.0277146100997925, "rewards/rejected": -3.1531929969787598, "step": 2290 }, { "epoch": 0.3, "grad_norm": 7.46875, "learning_rate": 4.410126333281815e-06, "logits/chosen": 1.8152294158935547, "logits/rejected": 2.9172585010528564, "logps/chosen": -488.97698974609375, "logps/rejected": -560.0440673828125, "loss": 0.5431, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.2743351459503174, "rewards/margins": 0.9938778877258301, "rewards/rejected": -3.2682127952575684, "step": 2300 }, { "epoch": 0.3, "eval_logits/chosen": 3.20039963722229, "eval_logits/rejected": 4.24326229095459, "eval_logps/chosen": -483.9386901855469, "eval_logps/rejected": -566.412353515625, "eval_loss": 0.52531898021698, "eval_rewards/accuracies": 0.734000027179718, "eval_rewards/chosen": -2.193176507949829, "eval_rewards/margins": 1.0251572132110596, "eval_rewards/rejected": -3.2183339595794678, "eval_runtime": 1591.4881, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 2300 }, { "epoch": 0.3, "grad_norm": 5.9375, "learning_rate": 4.402738321679918e-06, "logits/chosen": 2.252061367034912, "logits/rejected": 3.324723720550537, "logps/chosen": -471.62652587890625, "logps/rejected": -554.5105590820312, "loss": 0.4985, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8434559106826782, "rewards/margins": 1.3258936405181885, "rewards/rejected": -3.169349193572998, "step": 2310 }, { "epoch": 0.3, "grad_norm": 16.25, "learning_rate": 4.395310601895772e-06, "logits/chosen": 2.3367249965667725, "logits/rejected": 3.5756278038024902, "logps/chosen": -453.25933837890625, "logps/rejected": -523.735107421875, "loss": 0.4899, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.076174259185791, "rewards/margins": 1.1950877904891968, "rewards/rejected": -3.271261692047119, "step": 2320 }, { "epoch": 0.3, "grad_norm": 6.40625, "learning_rate": 4.38784332893821e-06, "logits/chosen": 2.454847574234009, "logits/rejected": 3.7376132011413574, "logps/chosen": -522.7352294921875, "logps/rejected": -657.8385009765625, "loss": 0.5333, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.667996883392334, "rewards/margins": 1.2865402698516846, "rewards/rejected": -3.9545376300811768, "step": 2330 }, { "epoch": 0.31, "grad_norm": 16.0, "learning_rate": 4.380336658641503e-06, "logits/chosen": 2.550139904022217, "logits/rejected": 3.860103130340576, "logps/chosen": -598.062744140625, "logps/rejected": -661.4760131835938, "loss": 0.6555, "rewards/accuracies": 0.6875, "rewards/chosen": -3.558239459991455, "rewards/margins": 0.8881810307502747, "rewards/rejected": -4.446420669555664, "step": 2340 }, { "epoch": 0.31, "grad_norm": 7.625, "learning_rate": 4.372790747662101e-06, "logits/chosen": 3.1860485076904297, "logits/rejected": 4.244828224182129, "logps/chosen": -585.7864379882812, "logps/rejected": -661.202880859375, "loss": 0.5341, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.656553268432617, "rewards/margins": 0.9850096702575684, "rewards/rejected": -4.6415629386901855, "step": 2350 }, { "epoch": 0.31, "grad_norm": 19.25, "learning_rate": 4.365205753475367e-06, "logits/chosen": 2.958014488220215, "logits/rejected": 3.7595715522766113, "logps/chosen": -587.1311645507812, "logps/rejected": -652.6437377929688, "loss": 0.5951, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.6158947944641113, "rewards/margins": 0.7533068656921387, "rewards/rejected": -4.369201183319092, "step": 2360 }, { "epoch": 0.31, "grad_norm": 8.9375, "learning_rate": 4.35758183437229e-06, "logits/chosen": 2.5866026878356934, "logits/rejected": 3.5570175647735596, "logps/chosen": -560.3768920898438, "logps/rejected": -637.0328369140625, "loss": 0.4961, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.0859694480895996, "rewards/margins": 0.9284812211990356, "rewards/rejected": -4.014450550079346, "step": 2370 }, { "epoch": 0.31, "grad_norm": 20.25, "learning_rate": 4.3499191494561835e-06, "logits/chosen": 3.0706686973571777, "logits/rejected": 3.0112204551696777, "logps/chosen": -586.0211181640625, "logps/rejected": -692.83056640625, "loss": 0.5437, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.1680941581726074, "rewards/margins": 0.9448806643486023, "rewards/rejected": -4.112975120544434, "step": 2380 }, { "epoch": 0.31, "grad_norm": 23.0, "learning_rate": 4.3422178586393615e-06, "logits/chosen": 3.2278523445129395, "logits/rejected": 3.951270341873169, "logps/chosen": -556.1135864257812, "logps/rejected": -637.8445434570312, "loss": 0.5511, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0236804485321045, "rewards/margins": 0.825579822063446, "rewards/rejected": -3.8492603302001953, "step": 2390 }, { "epoch": 0.31, "grad_norm": 23.875, "learning_rate": 4.334478122639804e-06, "logits/chosen": 2.0170609951019287, "logits/rejected": 3.2398910522460938, "logps/chosen": -566.9010009765625, "logps/rejected": -622.6240234375, "loss": 0.5147, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.843874454498291, "rewards/margins": 0.7945558428764343, "rewards/rejected": -3.6384308338165283, "step": 2400 }, { "epoch": 0.31, "eval_logits/chosen": 3.6861412525177, "eval_logits/rejected": 4.677193641662598, "eval_logps/chosen": -549.0342407226562, "eval_logps/rejected": -632.5286254882812, "eval_loss": 0.5131849646568298, "eval_rewards/accuracies": 0.7315000295639038, "eval_rewards/chosen": -2.8441319465637207, "eval_rewards/margins": 1.035365104675293, "eval_rewards/rejected": -3.8794968128204346, "eval_runtime": 1590.6945, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 2400 }, { "epoch": 0.32, "grad_norm": 11.375, "learning_rate": 4.3267001029778015e-06, "logits/chosen": 1.7796494960784912, "logits/rejected": 3.778020143508911, "logps/chosen": -546.0569458007812, "logps/rejected": -683.8525390625, "loss": 0.3845, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.809384822845459, "rewards/margins": 1.5442684888839722, "rewards/rejected": -4.353653907775879, "step": 2410 }, { "epoch": 0.32, "grad_norm": 15.25, "learning_rate": 4.318883961972585e-06, "logits/chosen": 2.531733989715576, "logits/rejected": 3.231806516647339, "logps/chosen": -588.2597045898438, "logps/rejected": -721.5579223632812, "loss": 0.3592, "rewards/accuracies": 0.875, "rewards/chosen": -2.820429563522339, "rewards/margins": 1.508816123008728, "rewards/rejected": -4.329245567321777, "step": 2420 }, { "epoch": 0.32, "grad_norm": 13.625, "learning_rate": 4.311029862738942e-06, "logits/chosen": 2.3296523094177246, "logits/rejected": 4.2262067794799805, "logps/chosen": -591.3225708007812, "logps/rejected": -731.8248901367188, "loss": 0.4765, "rewards/accuracies": 0.75, "rewards/chosen": -3.486849308013916, "rewards/margins": 1.516104817390442, "rewards/rejected": -5.00295352935791, "step": 2430 }, { "epoch": 0.32, "grad_norm": 18.25, "learning_rate": 4.303137969183804e-06, "logits/chosen": 3.0578298568725586, "logits/rejected": 3.899970531463623, "logps/chosen": -667.5275268554688, "logps/rejected": -920.3175659179688, "loss": 0.3092, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.254639625549316, "rewards/margins": 2.5148143768310547, "rewards/rejected": -6.7694549560546875, "step": 2440 }, { "epoch": 0.32, "grad_norm": 33.5, "learning_rate": 4.295208446002832e-06, "logits/chosen": 3.725722551345825, "logits/rejected": 4.002905368804932, "logps/chosen": -707.6356811523438, "logps/rejected": -930.8513793945312, "loss": 0.5664, "rewards/accuracies": 0.75, "rewards/chosen": -4.71712589263916, "rewards/margins": 1.9741309881210327, "rewards/rejected": -6.691256523132324, "step": 2450 }, { "epoch": 0.32, "grad_norm": 24.125, "learning_rate": 4.287241458676981e-06, "logits/chosen": 2.3860459327697754, "logits/rejected": 3.560899257659912, "logps/chosen": -657.5707397460938, "logps/rejected": -731.2125244140625, "loss": 0.7209, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.795893430709839, "rewards/margins": 1.0225095748901367, "rewards/rejected": -4.818403244018555, "step": 2460 }, { "epoch": 0.32, "grad_norm": 6.53125, "learning_rate": 4.279237173469043e-06, "logits/chosen": 0.9039813876152039, "logits/rejected": 2.137092351913452, "logps/chosen": -501.758056640625, "logps/rejected": -549.93896484375, "loss": 0.5286, "rewards/accuracies": 0.75, "rewards/chosen": -2.340841770172119, "rewards/margins": 1.0801969766616821, "rewards/rejected": -3.4210383892059326, "step": 2470 }, { "epoch": 0.32, "grad_norm": 8.4375, "learning_rate": 4.271195757420177e-06, "logits/chosen": 1.0410025119781494, "logits/rejected": 1.2670671939849854, "logps/chosen": -480.5115661621094, "logps/rejected": -636.4711303710938, "loss": 0.498, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0763466358184814, "rewards/margins": 1.0617152452468872, "rewards/rejected": -3.138062000274658, "step": 2480 }, { "epoch": 0.33, "grad_norm": 10.375, "learning_rate": 4.263117378346425e-06, "logits/chosen": 1.2630422115325928, "logits/rejected": 2.5274930000305176, "logps/chosen": -476.0921325683594, "logps/rejected": -531.6831665039062, "loss": 0.5598, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.141141414642334, "rewards/margins": 1.0511398315429688, "rewards/rejected": -3.1922812461853027, "step": 2490 }, { "epoch": 0.33, "grad_norm": 15.5, "learning_rate": 4.255002204835208e-06, "logits/chosen": 1.1449196338653564, "logits/rejected": 1.7821890115737915, "logps/chosen": -445.55047607421875, "logps/rejected": -591.94580078125, "loss": 0.4198, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.1304402351379395, "rewards/margins": 1.2741626501083374, "rewards/rejected": -3.4046032428741455, "step": 2500 }, { "epoch": 0.33, "eval_logits/chosen": 1.8510550260543823, "eval_logits/rejected": 2.794989824295044, "eval_logps/chosen": -482.1783447265625, "eval_logps/rejected": -559.00537109375, "eval_loss": 0.5213505029678345, "eval_rewards/accuracies": 0.7289999723434448, "eval_rewards/chosen": -2.1755733489990234, "eval_rewards/margins": 0.9686914086341858, "eval_rewards/rejected": -3.1442646980285645, "eval_runtime": 1592.0183, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 2500 }, { "epoch": 0.33, "grad_norm": 9.625, "learning_rate": 4.246850406241812e-06, "logits/chosen": 0.7812983393669128, "logits/rejected": 1.765454649925232, "logps/chosen": -497.0265197753906, "logps/rejected": -563.72216796875, "loss": 0.4645, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.216409683227539, "rewards/margins": 0.9073765873908997, "rewards/rejected": -3.123786211013794, "step": 2510 }, { "epoch": 0.33, "grad_norm": 22.125, "learning_rate": 4.2386621526858465e-06, "logits/chosen": 1.0584173202514648, "logits/rejected": 2.6873929500579834, "logps/chosen": -561.4659423828125, "logps/rejected": -631.4401245117188, "loss": 0.5197, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.6689488887786865, "rewards/margins": 1.138954758644104, "rewards/rejected": -3.80790376663208, "step": 2520 }, { "epoch": 0.33, "grad_norm": 33.0, "learning_rate": 4.2304376150477015e-06, "logits/chosen": 1.5355218648910522, "logits/rejected": 2.0197160243988037, "logps/chosen": -534.4373779296875, "logps/rejected": -681.6929931640625, "loss": 0.485, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.7461304664611816, "rewards/margins": 1.2848045825958252, "rewards/rejected": -4.030934810638428, "step": 2530 }, { "epoch": 0.33, "grad_norm": 16.625, "learning_rate": 4.222176964964977e-06, "logits/chosen": 1.1447181701660156, "logits/rejected": 2.849592685699463, "logps/chosen": -565.1495361328125, "logps/rejected": -607.2949829101562, "loss": 0.7026, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.872459888458252, "rewards/margins": 0.8781876564025879, "rewards/rejected": -3.7506473064422607, "step": 2540 }, { "epoch": 0.33, "grad_norm": 12.0625, "learning_rate": 4.213880374828903e-06, "logits/chosen": 1.039074420928955, "logits/rejected": 3.4681191444396973, "logps/chosen": -507.37188720703125, "logps/rejected": -588.2506713867188, "loss": 0.3846, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.539095401763916, "rewards/margins": 1.507993459701538, "rewards/rejected": -4.047088623046875, "step": 2550 }, { "epoch": 0.33, "grad_norm": 16.625, "learning_rate": 4.2055480177807406e-06, "logits/chosen": 1.8058216571807861, "logits/rejected": 2.808535575866699, "logps/chosen": -577.8333740234375, "logps/rejected": -715.6800537109375, "loss": 0.5481, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.1704020500183105, "rewards/margins": 1.268444299697876, "rewards/rejected": -4.438846588134766, "step": 2560 }, { "epoch": 0.34, "grad_norm": 38.25, "learning_rate": 4.1971800677081696e-06, "logits/chosen": 1.7406212091445923, "logits/rejected": 2.802582263946533, "logps/chosen": -610.0017700195312, "logps/rejected": -673.1057739257812, "loss": 0.5932, "rewards/accuracies": 0.75, "rewards/chosen": -3.347029209136963, "rewards/margins": 1.028158187866211, "rewards/rejected": -4.375187397003174, "step": 2570 }, { "epoch": 0.34, "grad_norm": 10.625, "learning_rate": 4.188776699241661e-06, "logits/chosen": 0.8740745782852173, "logits/rejected": 2.8264269828796387, "logps/chosen": -645.9588012695312, "logps/rejected": -742.9510498046875, "loss": 0.4183, "rewards/accuracies": 0.875, "rewards/chosen": -3.5243937969207764, "rewards/margins": 1.566954255104065, "rewards/rejected": -5.091347694396973, "step": 2580 }, { "epoch": 0.34, "grad_norm": 14.5625, "learning_rate": 4.180338087750827e-06, "logits/chosen": 1.4372971057891846, "logits/rejected": 2.609794855117798, "logps/chosen": -664.0543212890625, "logps/rejected": -711.6912841796875, "loss": 0.6018, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.798950672149658, "rewards/margins": 1.0835754871368408, "rewards/rejected": -4.88252592086792, "step": 2590 }, { "epoch": 0.34, "grad_norm": 5.75, "learning_rate": 4.1718644093407704e-06, "logits/chosen": 1.4924824237823486, "logits/rejected": 3.1547598838806152, "logps/chosen": -639.6531982421875, "logps/rejected": -705.1632080078125, "loss": 0.5994, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.5833652019500732, "rewards/margins": 0.9800311923027039, "rewards/rejected": -4.563396453857422, "step": 2600 }, { "epoch": 0.34, "eval_logits/chosen": 2.444972515106201, "eval_logits/rejected": 3.4511168003082275, "eval_logps/chosen": -577.7604370117188, "eval_logps/rejected": -663.0682983398438, "eval_loss": 0.5188149809837341, "eval_rewards/accuracies": 0.7289999723434448, "eval_rewards/chosen": -3.1313939094543457, "eval_rewards/margins": 1.0535000562667847, "eval_rewards/rejected": -4.18489408493042, "eval_runtime": 1591.8209, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 2600 }, { "epoch": 0.34, "grad_norm": 9.1875, "learning_rate": 4.163355840848401e-06, "logits/chosen": 0.8957176208496094, "logits/rejected": 2.534245491027832, "logps/chosen": -527.8390502929688, "logps/rejected": -632.1031494140625, "loss": 0.3694, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.750108242034912, "rewards/margins": 1.2850162982940674, "rewards/rejected": -4.035124778747559, "step": 2610 }, { "epoch": 0.34, "grad_norm": 7.875, "learning_rate": 4.154812559838748e-06, "logits/chosen": 1.8887290954589844, "logits/rejected": 3.2408337593078613, "logps/chosen": -583.8853149414062, "logps/rejected": -626.079345703125, "loss": 0.6086, "rewards/accuracies": 0.6875, "rewards/chosen": -3.222743511199951, "rewards/margins": 0.9295894503593445, "rewards/rejected": -4.1523332595825195, "step": 2620 }, { "epoch": 0.34, "grad_norm": 11.0625, "learning_rate": 4.146234744601259e-06, "logits/chosen": 1.4272115230560303, "logits/rejected": 2.444230079650879, "logps/chosen": -493.49884033203125, "logps/rejected": -589.1790771484375, "loss": 0.4433, "rewards/accuracies": 0.75, "rewards/chosen": -2.5310328006744385, "rewards/margins": 1.1930855512619019, "rewards/rejected": -3.72411847114563, "step": 2630 }, { "epoch": 0.35, "grad_norm": 14.3125, "learning_rate": 4.137622574146071e-06, "logits/chosen": 0.8492835164070129, "logits/rejected": 1.681236982345581, "logps/chosen": -459.9530334472656, "logps/rejected": -513.4580078125, "loss": 0.5401, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9301316738128662, "rewards/margins": 0.7894424200057983, "rewards/rejected": -2.719574451446533, "step": 2640 }, { "epoch": 0.35, "grad_norm": 13.375, "learning_rate": 4.12897622820028e-06, "logits/chosen": 1.0547001361846924, "logits/rejected": 2.3231730461120605, "logps/chosen": -474.957275390625, "logps/rejected": -489.51934814453125, "loss": 0.5108, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0215628147125244, "rewards/margins": 0.9034984707832336, "rewards/rejected": -2.9250614643096924, "step": 2650 }, { "epoch": 0.35, "grad_norm": 14.8125, "learning_rate": 4.120295887204191e-06, "logits/chosen": 1.256225824356079, "logits/rejected": 1.9702694416046143, "logps/chosen": -473.0255432128906, "logps/rejected": -558.8773193359375, "loss": 0.59, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.428616762161255, "rewards/margins": 0.8970969319343567, "rewards/rejected": -3.325714111328125, "step": 2660 }, { "epoch": 0.35, "grad_norm": 13.5625, "learning_rate": 4.111581732307548e-06, "logits/chosen": 0.9635303616523743, "logits/rejected": 1.6423721313476562, "logps/chosen": -512.5065307617188, "logps/rejected": -541.4960327148438, "loss": 0.5615, "rewards/accuracies": 0.75, "rewards/chosen": -2.5323760509490967, "rewards/margins": 0.6085655093193054, "rewards/rejected": -3.140941619873047, "step": 2670 }, { "epoch": 0.35, "grad_norm": 10.6875, "learning_rate": 4.1028339453657595e-06, "logits/chosen": 1.0258954763412476, "logits/rejected": 1.8250477313995361, "logps/chosen": -522.21484375, "logps/rejected": -591.0906982421875, "loss": 0.4589, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.263209819793701, "rewards/margins": 1.0161672830581665, "rewards/rejected": -3.2793774604797363, "step": 2680 }, { "epoch": 0.35, "grad_norm": 7.1875, "learning_rate": 4.094052708936096e-06, "logits/chosen": 1.2521384954452515, "logits/rejected": 3.028597354888916, "logps/chosen": -585.7796630859375, "logps/rejected": -675.7745361328125, "loss": 0.4939, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.0005898475646973, "rewards/margins": 1.0883712768554688, "rewards/rejected": -4.088961124420166, "step": 2690 }, { "epoch": 0.35, "grad_norm": 9.9375, "learning_rate": 4.0852382062738874e-06, "logits/chosen": 1.6026700735092163, "logits/rejected": 3.266209125518799, "logps/chosen": -545.3231811523438, "logps/rejected": -654.74365234375, "loss": 0.4812, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.0284156799316406, "rewards/margins": 1.3262989521026611, "rewards/rejected": -4.354714393615723, "step": 2700 }, { "epoch": 0.35, "eval_logits/chosen": 2.7916452884674072, "eval_logits/rejected": 3.776010274887085, "eval_logps/chosen": -565.985107421875, "eval_logps/rejected": -655.18212890625, "eval_loss": 0.5139148235321045, "eval_rewards/accuracies": 0.7455000281333923, "eval_rewards/chosen": -3.013641119003296, "eval_rewards/margins": 1.0923913717269897, "eval_rewards/rejected": -4.106032371520996, "eval_runtime": 1592.1095, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 2700 }, { "epoch": 0.35, "grad_norm": 10.3125, "learning_rate": 4.076390621328693e-06, "logits/chosen": 1.6921123266220093, "logits/rejected": 3.050382137298584, "logps/chosen": -557.9287719726562, "logps/rejected": -709.6575317382812, "loss": 0.378, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.9646167755126953, "rewards/margins": 1.6953115463256836, "rewards/rejected": -4.659928321838379, "step": 2710 }, { "epoch": 0.36, "grad_norm": 14.0625, "learning_rate": 4.067510138740467e-06, "logits/chosen": 1.3908737897872925, "logits/rejected": 2.611626625061035, "logps/chosen": -581.0802001953125, "logps/rejected": -624.2890625, "loss": 0.5162, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.96773099899292, "rewards/margins": 1.0839290618896484, "rewards/rejected": -4.051660060882568, "step": 2720 }, { "epoch": 0.36, "grad_norm": 23.25, "learning_rate": 4.058596943835703e-06, "logits/chosen": 1.9080839157104492, "logits/rejected": 2.4143242835998535, "logps/chosen": -556.7078857421875, "logps/rejected": -639.7720947265625, "loss": 0.4337, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0165634155273438, "rewards/margins": 1.1963437795639038, "rewards/rejected": -4.212907314300537, "step": 2730 }, { "epoch": 0.36, "grad_norm": 15.375, "learning_rate": 4.049651222623568e-06, "logits/chosen": 1.776745080947876, "logits/rejected": 1.8187005519866943, "logps/chosen": -623.6613159179688, "logps/rejected": -740.4357299804688, "loss": 0.5837, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.6197190284729004, "rewards/margins": 1.086064338684082, "rewards/rejected": -4.705783843994141, "step": 2740 }, { "epoch": 0.36, "grad_norm": 13.0625, "learning_rate": 4.040673161792014e-06, "logits/chosen": 1.0641945600509644, "logits/rejected": 2.175415277481079, "logps/chosen": -627.2669677734375, "logps/rejected": -683.8802490234375, "loss": 0.5289, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.2108218669891357, "rewards/margins": 0.9524551630020142, "rewards/rejected": -4.163276672363281, "step": 2750 }, { "epoch": 0.36, "grad_norm": 12.0625, "learning_rate": 4.031662948703896e-06, "logits/chosen": 1.3657619953155518, "logits/rejected": 2.5233426094055176, "logps/chosen": -588.10595703125, "logps/rejected": -648.07666015625, "loss": 0.5145, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.9469008445739746, "rewards/margins": 0.9836384057998657, "rewards/rejected": -3.93053936958313, "step": 2760 }, { "epoch": 0.36, "grad_norm": 14.4375, "learning_rate": 4.022620771393047e-06, "logits/chosen": 1.4988666772842407, "logits/rejected": 2.586674213409424, "logps/chosen": -627.62744140625, "logps/rejected": -679.7816162109375, "loss": 0.5464, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.4070639610290527, "rewards/margins": 0.8845176696777344, "rewards/rejected": -4.291582107543945, "step": 2770 }, { "epoch": 0.36, "grad_norm": 13.3125, "learning_rate": 4.013546818560362e-06, "logits/chosen": 1.5694141387939453, "logits/rejected": 2.882098436355591, "logps/chosen": -605.7820434570312, "logps/rejected": -676.2273559570312, "loss": 0.4648, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.2976009845733643, "rewards/margins": 1.179705023765564, "rewards/rejected": -4.4773054122924805, "step": 2780 }, { "epoch": 0.37, "grad_norm": 17.75, "learning_rate": 4.00444127956986e-06, "logits/chosen": 1.1843597888946533, "logits/rejected": 2.589926242828369, "logps/chosen": -620.7827758789062, "logps/rejected": -640.5551147460938, "loss": 0.5387, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.944465160369873, "rewards/margins": 0.9514845609664917, "rewards/rejected": -3.895949602127075, "step": 2790 }, { "epoch": 0.37, "grad_norm": 7.25, "learning_rate": 3.9953043444447255e-06, "logits/chosen": 1.532142996788025, "logits/rejected": 2.036409854888916, "logps/chosen": -530.7525634765625, "logps/rejected": -674.3682861328125, "loss": 0.4696, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.759455680847168, "rewards/margins": 1.3281762599945068, "rewards/rejected": -4.087632179260254, "step": 2800 }, { "epoch": 0.37, "eval_logits/chosen": 1.8288776874542236, "eval_logits/rejected": 2.6756598949432373, "eval_logps/chosen": -487.6709289550781, "eval_logps/rejected": -568.2573852539062, "eval_loss": 0.5136556625366211, "eval_rewards/accuracies": 0.7354999780654907, "eval_rewards/chosen": -2.230499505996704, "eval_rewards/margins": 1.0062857866287231, "eval_rewards/rejected": -3.2367851734161377, "eval_runtime": 1591.8133, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 2800 }, { "epoch": 0.37, "grad_norm": 8.125, "learning_rate": 3.986136203863355e-06, "logits/chosen": 0.38360413908958435, "logits/rejected": 1.1356967687606812, "logps/chosen": -421.0419006347656, "logps/rejected": -479.0397033691406, "loss": 0.5937, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9757299423217773, "rewards/margins": 0.8018991351127625, "rewards/rejected": -2.7776291370391846, "step": 2810 }, { "epoch": 0.37, "grad_norm": 17.625, "learning_rate": 3.976937049155365e-06, "logits/chosen": -0.2097533941268921, "logits/rejected": 0.9864130020141602, "logps/chosen": -423.400390625, "logps/rejected": -491.6302185058594, "loss": 0.5371, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.563111424446106, "rewards/margins": 0.8165823817253113, "rewards/rejected": -2.3796939849853516, "step": 2820 }, { "epoch": 0.37, "grad_norm": 13.8125, "learning_rate": 3.967707072297608e-06, "logits/chosen": 0.0686485767364502, "logits/rejected": 0.5922690629959106, "logps/chosen": -426.5838317871094, "logps/rejected": -479.63543701171875, "loss": 0.575, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5743776559829712, "rewards/margins": 0.6596744656562805, "rewards/rejected": -2.2340521812438965, "step": 2830 }, { "epoch": 0.37, "grad_norm": 24.75, "learning_rate": 3.958446465910159e-06, "logits/chosen": 0.03275877237319946, "logits/rejected": 1.1897265911102295, "logps/chosen": -419.6893615722656, "logps/rejected": -487.18505859375, "loss": 0.4558, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.679578185081482, "rewards/margins": 1.0936287641525269, "rewards/rejected": -2.773206949234009, "step": 2840 }, { "epoch": 0.37, "grad_norm": 30.25, "learning_rate": 3.9491554232523066e-06, "logits/chosen": 1.2540711164474487, "logits/rejected": 1.7490119934082031, "logps/chosen": -571.7868041992188, "logps/rejected": -663.6853637695312, "loss": 0.505, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0915579795837402, "rewards/margins": 1.137508511543274, "rewards/rejected": -4.229066371917725, "step": 2850 }, { "epoch": 0.37, "grad_norm": 8.8125, "learning_rate": 3.939834138218505e-06, "logits/chosen": 1.5485732555389404, "logits/rejected": 2.4029109477996826, "logps/chosen": -579.219970703125, "logps/rejected": -664.6459350585938, "loss": 0.6115, "rewards/accuracies": 0.6875, "rewards/chosen": -3.1892542839050293, "rewards/margins": 1.0732507705688477, "rewards/rejected": -4.262505054473877, "step": 2860 }, { "epoch": 0.38, "grad_norm": 20.75, "learning_rate": 3.930482805334339e-06, "logits/chosen": 0.921392560005188, "logits/rejected": 1.3949135541915894, "logps/chosen": -425.5755310058594, "logps/rejected": -598.9391479492188, "loss": 0.4478, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.347729206085205, "rewards/margins": 1.4377492666244507, "rewards/rejected": -3.7854785919189453, "step": 2870 }, { "epoch": 0.38, "grad_norm": 9.8125, "learning_rate": 3.921101619752464e-06, "logits/chosen": 0.45665979385375977, "logits/rejected": 1.0926839113235474, "logps/chosen": -460.4278259277344, "logps/rejected": -552.573486328125, "loss": 0.4864, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.01448917388916, "rewards/margins": 1.2656866312026978, "rewards/rejected": -3.2801756858825684, "step": 2880 }, { "epoch": 0.38, "grad_norm": 10.6875, "learning_rate": 3.911690777248525e-06, "logits/chosen": 0.17906469106674194, "logits/rejected": 0.6382294297218323, "logps/chosen": -469.03167724609375, "logps/rejected": -578.2459716796875, "loss": 0.494, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.113701581954956, "rewards/margins": 1.1188174486160278, "rewards/rejected": -3.2325191497802734, "step": 2890 }, { "epoch": 0.38, "grad_norm": 21.875, "learning_rate": 3.902250474217079e-06, "logits/chosen": -0.15756431221961975, "logits/rejected": 1.098966360092163, "logps/chosen": -430.4964294433594, "logps/rejected": -522.6734619140625, "loss": 0.5418, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.887865662574768, "rewards/margins": 1.009726643562317, "rewards/rejected": -2.897592067718506, "step": 2900 }, { "epoch": 0.38, "eval_logits/chosen": 1.189926028251648, "eval_logits/rejected": 2.0189223289489746, "eval_logps/chosen": -471.02703857421875, "eval_logps/rejected": -559.2019653320312, "eval_loss": 0.5176796317100525, "eval_rewards/accuracies": 0.734499990940094, "eval_rewards/chosen": -2.0640602111816406, "eval_rewards/margins": 1.0821698904037476, "eval_rewards/rejected": -3.1462302207946777, "eval_runtime": 1591.2862, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 2900 }, { "epoch": 0.38, "grad_norm": 9.75, "learning_rate": 3.892780907667495e-06, "logits/chosen": 0.06123056262731552, "logits/rejected": 0.9262319803237915, "logps/chosen": -484.229248046875, "logps/rejected": -581.5013427734375, "loss": 0.4237, "rewards/accuracies": 0.8125, "rewards/chosen": -1.977403998374939, "rewards/margins": 1.2522307634353638, "rewards/rejected": -3.229635238647461, "step": 2910 }, { "epoch": 0.38, "grad_norm": 11.5625, "learning_rate": 3.883282275219837e-06, "logits/chosen": 0.5973024368286133, "logits/rejected": 1.3094433546066284, "logps/chosen": -473.7518005371094, "logps/rejected": -624.4464111328125, "loss": 0.4704, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.3923592567443848, "rewards/margins": 1.4723341464996338, "rewards/rejected": -3.8646934032440186, "step": 2920 }, { "epoch": 0.38, "grad_norm": 7.21875, "learning_rate": 3.873754775100751e-06, "logits/chosen": 0.2925790548324585, "logits/rejected": 1.3715248107910156, "logps/chosen": -503.6017150878906, "logps/rejected": -597.2714233398438, "loss": 0.4469, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.3737165927886963, "rewards/margins": 1.3970870971679688, "rewards/rejected": -3.770803928375244, "step": 2930 }, { "epoch": 0.38, "grad_norm": 22.0, "learning_rate": 3.8641986061393145e-06, "logits/chosen": 0.028101766481995583, "logits/rejected": 1.5201627016067505, "logps/chosen": -550.6173095703125, "logps/rejected": -695.3499755859375, "loss": 0.4544, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7320361137390137, "rewards/margins": 1.6946220397949219, "rewards/rejected": -4.426657676696777, "step": 2940 }, { "epoch": 0.39, "grad_norm": 15.875, "learning_rate": 3.854613967762898e-06, "logits/chosen": 0.7224695682525635, "logits/rejected": 1.902295470237732, "logps/chosen": -589.226318359375, "logps/rejected": -696.6246337890625, "loss": 0.4663, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.0647222995758057, "rewards/margins": 1.5787670612335205, "rewards/rejected": -4.643489360809326, "step": 2950 }, { "epoch": 0.39, "grad_norm": 24.5, "learning_rate": 3.845001059992999e-06, "logits/chosen": 1.1261330842971802, "logits/rejected": 2.1730661392211914, "logps/chosen": -602.8712768554688, "logps/rejected": -735.8431396484375, "loss": 0.4671, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.3659489154815674, "rewards/margins": 1.5933061838150024, "rewards/rejected": -4.959255218505859, "step": 2960 }, { "epoch": 0.39, "grad_norm": 16.75, "learning_rate": 3.835360083441067e-06, "logits/chosen": 0.768014132976532, "logits/rejected": 1.7233359813690186, "logps/chosen": -632.5802612304688, "logps/rejected": -771.4032592773438, "loss": 0.4379, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.644712448120117, "rewards/margins": 1.5702012777328491, "rewards/rejected": -5.214913368225098, "step": 2970 }, { "epoch": 0.39, "grad_norm": 18.0, "learning_rate": 3.825691239304318e-06, "logits/chosen": 0.6955349445343018, "logits/rejected": 2.1932034492492676, "logps/chosen": -729.919189453125, "logps/rejected": -745.0521240234375, "loss": 0.7656, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.9248478412628174, "rewards/margins": 0.9639946222305298, "rewards/rejected": -4.888842582702637, "step": 2980 }, { "epoch": 0.39, "grad_norm": 26.875, "learning_rate": 3.8159947293615385e-06, "logits/chosen": 0.7099810838699341, "logits/rejected": 1.620305061340332, "logps/chosen": -573.0277099609375, "logps/rejected": -639.2322998046875, "loss": 0.4878, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.8682754039764404, "rewards/margins": 1.046812653541565, "rewards/rejected": -3.915087938308716, "step": 2990 }, { "epoch": 0.39, "grad_norm": 15.875, "learning_rate": 3.806270755968866e-06, "logits/chosen": 1.8222758769989014, "logits/rejected": 2.27435040473938, "logps/chosen": -488.9098205566406, "logps/rejected": -600.2335815429688, "loss": 0.5068, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.6705269813537598, "rewards/margins": 1.1377004384994507, "rewards/rejected": -3.808227062225342, "step": 3000 }, { "epoch": 0.39, "eval_logits/chosen": 2.0023179054260254, "eval_logits/rejected": 2.8678812980651855, "eval_logps/chosen": -510.2568664550781, "eval_logps/rejected": -601.0542602539062, "eval_loss": 0.5096300840377808, "eval_rewards/accuracies": 0.7400000095367432, "eval_rewards/chosen": -2.4563584327697754, "eval_rewards/margins": 1.1083952188491821, "eval_rewards/rejected": -3.564753770828247, "eval_runtime": 1591.3168, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 3000 }, { "epoch": 0.39, "grad_norm": 13.3125, "learning_rate": 3.7965195220555784e-06, "logits/chosen": 1.7134437561035156, "logits/rejected": 2.0564117431640625, "logps/chosen": -479.67449951171875, "logps/rejected": -604.8666381835938, "loss": 0.5536, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.6466455459594727, "rewards/margins": 1.1505428552627563, "rewards/rejected": -3.7971885204315186, "step": 3010 }, { "epoch": 0.4, "grad_norm": 12.0, "learning_rate": 3.786741231119847e-06, "logits/chosen": 0.3799988329410553, "logits/rejected": 1.70327889919281, "logps/chosen": -503.31268310546875, "logps/rejected": -576.9793701171875, "loss": 0.4446, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9507548809051514, "rewards/margins": 1.2016483545303345, "rewards/rejected": -3.1524033546447754, "step": 3020 }, { "epoch": 0.4, "grad_norm": 5.53125, "learning_rate": 3.7769360872244992e-06, "logits/chosen": 1.170353889465332, "logits/rejected": 1.2406959533691406, "logps/chosen": -431.1206970214844, "logps/rejected": -513.315185546875, "loss": 0.4801, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9646472930908203, "rewards/margins": 0.9679223895072937, "rewards/rejected": -2.932569980621338, "step": 3030 }, { "epoch": 0.4, "grad_norm": 15.875, "learning_rate": 3.767104294992754e-06, "logits/chosen": 0.9803180694580078, "logits/rejected": 2.352583885192871, "logps/chosen": -454.54541015625, "logps/rejected": -532.4876098632812, "loss": 0.46, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.114039659500122, "rewards/margins": 1.1198477745056152, "rewards/rejected": -3.2338874340057373, "step": 3040 }, { "epoch": 0.4, "grad_norm": 8.0, "learning_rate": 3.7572460596039524e-06, "logits/chosen": 0.7494764924049377, "logits/rejected": 1.6844794750213623, "logps/chosen": -467.81005859375, "logps/rejected": -567.791748046875, "loss": 0.5172, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.2958292961120605, "rewards/margins": 1.046289086341858, "rewards/rejected": -3.342118501663208, "step": 3050 }, { "epoch": 0.4, "grad_norm": 4.8125, "learning_rate": 3.74736158678928e-06, "logits/chosen": 0.8211283683776855, "logits/rejected": 1.3175485134124756, "logps/chosen": -428.0367736816406, "logps/rejected": -535.9208374023438, "loss": 0.4252, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8630352020263672, "rewards/margins": 1.1972407102584839, "rewards/rejected": -3.0602760314941406, "step": 3060 }, { "epoch": 0.4, "grad_norm": 8.875, "learning_rate": 3.7374510828274673e-06, "logits/chosen": 0.7421783208847046, "logits/rejected": 1.331279993057251, "logps/chosen": -464.83795166015625, "logps/rejected": -571.86767578125, "loss": 0.5101, "rewards/accuracies": 0.75, "rewards/chosen": -2.2767622470855713, "rewards/margins": 1.0495450496673584, "rewards/rejected": -3.3263068199157715, "step": 3070 }, { "epoch": 0.4, "grad_norm": 14.125, "learning_rate": 3.72751475454049e-06, "logits/chosen": 0.8095995783805847, "logits/rejected": 1.2267462015151978, "logps/chosen": -514.5219116210938, "logps/rejected": -657.3064575195312, "loss": 0.5369, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.805443048477173, "rewards/margins": 1.2120649814605713, "rewards/rejected": -4.017508029937744, "step": 3080 }, { "epoch": 0.4, "grad_norm": 37.0, "learning_rate": 3.7175528092892503e-06, "logits/chosen": 0.6734537482261658, "logits/rejected": 0.9628368616104126, "logps/chosen": -592.8623046875, "logps/rejected": -679.498046875, "loss": 0.6657, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.2352747917175293, "rewards/margins": 0.9270402789115906, "rewards/rejected": -4.162315368652344, "step": 3090 }, { "epoch": 0.41, "grad_norm": 15.9375, "learning_rate": 3.7075654549692498e-06, "logits/chosen": 0.25260859727859497, "logits/rejected": 1.0015579462051392, "logps/chosen": -547.7056884765625, "logps/rejected": -655.2227783203125, "loss": 0.4429, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.885751962661743, "rewards/margins": 1.3275845050811768, "rewards/rejected": -4.213336944580078, "step": 3100 }, { "epoch": 0.41, "eval_logits/chosen": 0.6491054892539978, "eval_logits/rejected": 1.3309341669082642, "eval_logps/chosen": -540.056640625, "eval_logps/rejected": -633.2681884765625, "eval_loss": 0.5323993563652039, "eval_rewards/accuracies": 0.7179999947547913, "eval_rewards/chosen": -2.7543554306030273, "eval_rewards/margins": 1.1325373649597168, "eval_rewards/rejected": -3.886892557144165, "eval_runtime": 1591.7657, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 3100 }, { "epoch": 0.41, "grad_norm": 13.1875, "learning_rate": 3.697552900006249e-06, "logits/chosen": 0.2841110825538635, "logits/rejected": 0.6535122990608215, "logps/chosen": -558.1544189453125, "logps/rejected": -608.4871826171875, "loss": 0.5408, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.807015895843506, "rewards/margins": 0.8503965139389038, "rewards/rejected": -3.65741229057312, "step": 3110 }, { "epoch": 0.41, "grad_norm": 10.625, "learning_rate": 3.6875153533519244e-06, "logits/chosen": 0.0682436004281044, "logits/rejected": 0.5094722509384155, "logps/chosen": -544.4560546875, "logps/rejected": -663.2474975585938, "loss": 0.5958, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.9370124340057373, "rewards/margins": 0.9294888377189636, "rewards/rejected": -3.8665013313293457, "step": 3120 }, { "epoch": 0.41, "grad_norm": 49.75, "learning_rate": 3.6774530244794992e-06, "logits/chosen": 0.13949742913246155, "logits/rejected": 0.7011431455612183, "logps/chosen": -490.24969482421875, "logps/rejected": -611.5720825195312, "loss": 0.5313, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.532118797302246, "rewards/margins": 1.2002549171447754, "rewards/rejected": -3.7323734760284424, "step": 3130 }, { "epoch": 0.41, "grad_norm": 10.75, "learning_rate": 3.667366123379378e-06, "logits/chosen": 0.017296016216278076, "logits/rejected": 0.8059386014938354, "logps/chosen": -509.57171630859375, "logps/rejected": -552.8974609375, "loss": 0.5477, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.5200209617614746, "rewards/margins": 0.8663853406906128, "rewards/rejected": -3.3864059448242188, "step": 3140 }, { "epoch": 0.41, "grad_norm": 10.5625, "learning_rate": 3.6572548605547607e-06, "logits/chosen": 0.2774070203304291, "logits/rejected": 1.5602456331253052, "logps/chosen": -511.4872131347656, "logps/rejected": -610.0732421875, "loss": 0.3978, "rewards/accuracies": 0.8125, "rewards/chosen": -2.307269811630249, "rewards/margins": 1.3458932638168335, "rewards/rejected": -3.653163194656372, "step": 3150 }, { "epoch": 0.41, "grad_norm": 12.0625, "learning_rate": 3.6471194470172538e-06, "logits/chosen": 0.7748087644577026, "logits/rejected": 1.4111305475234985, "logps/chosen": -550.489990234375, "logps/rejected": -685.2496948242188, "loss": 0.4381, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.574678421020508, "rewards/margins": 1.5018852949142456, "rewards/rejected": -4.076563358306885, "step": 3160 }, { "epoch": 0.41, "grad_norm": 19.25, "learning_rate": 3.636960094282461e-06, "logits/chosen": 0.6883363723754883, "logits/rejected": 1.5032721757888794, "logps/chosen": -598.4791870117188, "logps/rejected": -718.0567626953125, "loss": 0.4243, "rewards/accuracies": 0.8125, "rewards/chosen": -3.152695894241333, "rewards/margins": 1.4088233709335327, "rewards/rejected": -4.561519145965576, "step": 3170 }, { "epoch": 0.42, "grad_norm": 13.375, "learning_rate": 3.6267770143655743e-06, "logits/chosen": 0.7542746067047119, "logits/rejected": 2.28153395652771, "logps/chosen": -606.701171875, "logps/rejected": -643.6585693359375, "loss": 0.6149, "rewards/accuracies": 0.75, "rewards/chosen": -3.396790027618408, "rewards/margins": 1.0983827114105225, "rewards/rejected": -4.49517297744751, "step": 3180 }, { "epoch": 0.42, "grad_norm": 15.6875, "learning_rate": 3.6165704197769484e-06, "logits/chosen": 0.7957227230072021, "logits/rejected": 1.6306054592132568, "logps/chosen": -591.9930419921875, "logps/rejected": -690.844970703125, "loss": 0.4823, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.360551118850708, "rewards/margins": 1.256277322769165, "rewards/rejected": -4.616828441619873, "step": 3190 }, { "epoch": 0.42, "grad_norm": 6.9375, "learning_rate": 3.606340523517663e-06, "logits/chosen": 0.30117177963256836, "logits/rejected": 1.471394419670105, "logps/chosen": -613.8837890625, "logps/rejected": -644.7623291015625, "loss": 0.5977, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.912050724029541, "rewards/margins": 0.9650869369506836, "rewards/rejected": -3.8771374225616455, "step": 3200 }, { "epoch": 0.42, "eval_logits/chosen": 1.2328182458877563, "eval_logits/rejected": 2.0169761180877686, "eval_logps/chosen": -553.0416259765625, "eval_logps/rejected": -642.8284912109375, "eval_loss": 0.4963167607784271, "eval_rewards/accuracies": 0.7425000071525574, "eval_rewards/chosen": -2.8842058181762695, "eval_rewards/margins": 1.0982892513275146, "eval_rewards/rejected": -3.9824953079223633, "eval_runtime": 1591.8278, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 3200 }, { "epoch": 0.42, "grad_norm": 11.5, "learning_rate": 3.5960875390750793e-06, "logits/chosen": 0.5907621383666992, "logits/rejected": 1.5867271423339844, "logps/chosen": -515.3133544921875, "logps/rejected": -595.2828979492188, "loss": 0.4665, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7821831703186035, "rewards/margins": 1.0895627737045288, "rewards/rejected": -3.871746063232422, "step": 3210 }, { "epoch": 0.42, "grad_norm": 14.1875, "learning_rate": 3.585811680418386e-06, "logits/chosen": 0.5541056394577026, "logits/rejected": 1.5352404117584229, "logps/chosen": -594.6611328125, "logps/rejected": -660.6251831054688, "loss": 0.5488, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.016439914703369, "rewards/margins": 1.1275527477264404, "rewards/rejected": -4.1439924240112305, "step": 3220 }, { "epoch": 0.42, "grad_norm": 25.125, "learning_rate": 3.5755131619941347e-06, "logits/chosen": 0.729204535484314, "logits/rejected": 1.6516637802124023, "logps/chosen": -512.8201904296875, "logps/rejected": -624.3805541992188, "loss": 0.5408, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.7431254386901855, "rewards/margins": 1.1400768756866455, "rewards/rejected": -3.883202314376831, "step": 3230 }, { "epoch": 0.42, "grad_norm": 15.5, "learning_rate": 3.565192198721759e-06, "logits/chosen": 0.37775200605392456, "logits/rejected": 2.0130581855773926, "logps/chosen": -549.7015380859375, "logps/rejected": -609.7913208007812, "loss": 0.4715, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.597252368927002, "rewards/margins": 1.3094682693481445, "rewards/rejected": -3.9067211151123047, "step": 3240 }, { "epoch": 0.43, "grad_norm": 13.5, "learning_rate": 3.5548490059890965e-06, "logits/chosen": 0.22869491577148438, "logits/rejected": 1.187572717666626, "logps/chosen": -550.7482299804688, "logps/rejected": -615.3689575195312, "loss": 0.5063, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.6587748527526855, "rewards/margins": 1.0318644046783447, "rewards/rejected": -3.690639019012451, "step": 3250 }, { "epoch": 0.43, "grad_norm": 12.8125, "learning_rate": 3.5444837996478903e-06, "logits/chosen": 0.3918129503726959, "logits/rejected": 0.8749796152114868, "logps/chosen": -483.89794921875, "logps/rejected": -601.3150634765625, "loss": 0.4459, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.3012442588806152, "rewards/margins": 1.3543719053268433, "rewards/rejected": -3.655616044998169, "step": 3260 }, { "epoch": 0.43, "grad_norm": 26.125, "learning_rate": 3.534096796009282e-06, "logits/chosen": 0.4137166440486908, "logits/rejected": 1.1227707862854004, "logps/chosen": -560.236083984375, "logps/rejected": -709.2138671875, "loss": 0.5595, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.937579393386841, "rewards/margins": 1.2887606620788574, "rewards/rejected": -4.226339817047119, "step": 3270 }, { "epoch": 0.43, "grad_norm": 8.3125, "learning_rate": 3.5236882118393046e-06, "logits/chosen": 0.8245857357978821, "logits/rejected": 1.0247094631195068, "logps/chosen": -529.5103149414062, "logps/rejected": -669.4229125976562, "loss": 0.4461, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.791597604751587, "rewards/margins": 1.3754332065582275, "rewards/rejected": -4.1670308113098145, "step": 3280 }, { "epoch": 0.43, "grad_norm": 15.875, "learning_rate": 3.5132582643543513e-06, "logits/chosen": -0.049818553030490875, "logits/rejected": 0.3968813717365265, "logps/chosen": -564.7365112304688, "logps/rejected": -670.1153564453125, "loss": 0.5883, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.8896031379699707, "rewards/margins": 1.0605024099349976, "rewards/rejected": -3.950105667114258, "step": 3290 }, { "epoch": 0.43, "grad_norm": 12.25, "learning_rate": 3.5028071712166456e-06, "logits/chosen": -0.1731199026107788, "logits/rejected": 0.6234968900680542, "logps/chosen": -515.9782104492188, "logps/rejected": -608.0538330078125, "loss": 0.5281, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.5141870975494385, "rewards/margins": 1.0168650150299072, "rewards/rejected": -3.5310521125793457, "step": 3300 }, { "epoch": 0.43, "eval_logits/chosen": 0.4294242560863495, "eval_logits/rejected": 1.1826006174087524, "eval_logps/chosen": -507.1646728515625, "eval_logps/rejected": -599.690673828125, "eval_loss": 0.5074065327644348, "eval_rewards/accuracies": 0.7325000166893005, "eval_rewards/chosen": -2.425436019897461, "eval_rewards/margins": 1.1256811618804932, "eval_rewards/rejected": -3.551117181777954, "eval_runtime": 1590.7186, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 3300 }, { "epoch": 0.43, "grad_norm": 11.0625, "learning_rate": 3.4923351505297008e-06, "logits/chosen": 0.23522624373435974, "logits/rejected": 0.05815122276544571, "logps/chosen": -443.71710205078125, "logps/rejected": -581.3840942382812, "loss": 0.5672, "rewards/accuracies": 0.75, "rewards/chosen": -2.257312059402466, "rewards/margins": 0.9799342155456543, "rewards/rejected": -3.23724627494812, "step": 3310 }, { "epoch": 0.43, "grad_norm": 22.5, "learning_rate": 3.481842420833766e-06, "logits/chosen": -0.09912939369678497, "logits/rejected": 0.4761788249015808, "logps/chosen": -439.24090576171875, "logps/rejected": -557.4966430664062, "loss": 0.5156, "rewards/accuracies": 0.75, "rewards/chosen": -2.2463881969451904, "rewards/margins": 1.1034013032913208, "rewards/rejected": -3.34978985786438, "step": 3320 }, { "epoch": 0.44, "grad_norm": 20.75, "learning_rate": 3.4713292011012645e-06, "logits/chosen": -0.19417408108711243, "logits/rejected": 0.4867437481880188, "logps/chosen": -598.3062133789062, "logps/rejected": -665.0530395507812, "loss": 0.5448, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8582839965820312, "rewards/margins": 1.1232712268829346, "rewards/rejected": -3.981555223464966, "step": 3330 }, { "epoch": 0.44, "grad_norm": 36.75, "learning_rate": 3.4607957107322277e-06, "logits/chosen": 0.16228322684764862, "logits/rejected": 0.97930908203125, "logps/chosen": -601.2854614257812, "logps/rejected": -665.2420043945312, "loss": 0.5648, "rewards/accuracies": 0.75, "rewards/chosen": -3.2820160388946533, "rewards/margins": 0.9795142412185669, "rewards/rejected": -4.261530876159668, "step": 3340 }, { "epoch": 0.44, "grad_norm": 7.1875, "learning_rate": 3.4502421695497112e-06, "logits/chosen": 0.13692227005958557, "logits/rejected": 1.0607303380966187, "logps/chosen": -552.9117431640625, "logps/rejected": -664.9490356445312, "loss": 0.4744, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.911181688308716, "rewards/margins": 1.3374974727630615, "rewards/rejected": -4.248679161071777, "step": 3350 }, { "epoch": 0.44, "grad_norm": 10.125, "learning_rate": 3.4396687977952137e-06, "logits/chosen": 0.16164417564868927, "logits/rejected": 0.38801008462905884, "logps/chosen": -572.5013427734375, "logps/rejected": -685.3876342773438, "loss": 0.4979, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.01336407661438, "rewards/margins": 1.1361128091812134, "rewards/rejected": -4.149477481842041, "step": 3360 }, { "epoch": 0.44, "grad_norm": 7.78125, "learning_rate": 3.429075816124075e-06, "logits/chosen": -0.2848863899707794, "logits/rejected": 0.6865512132644653, "logps/chosen": -520.8642578125, "logps/rejected": -632.9857177734375, "loss": 0.3749, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.5089519023895264, "rewards/margins": 1.552814245223999, "rewards/rejected": -4.061766147613525, "step": 3370 }, { "epoch": 0.44, "grad_norm": 8.875, "learning_rate": 3.418463445600874e-06, "logits/chosen": 0.7481366991996765, "logits/rejected": 1.7227299213409424, "logps/chosen": -542.5957641601562, "logps/rejected": -732.8658447265625, "loss": 0.387, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.3663573265075684, "rewards/margins": 1.6481434106826782, "rewards/rejected": -5.014500617980957, "step": 3380 }, { "epoch": 0.44, "grad_norm": 8.3125, "learning_rate": 3.4078319076948173e-06, "logits/chosen": 0.3249647617340088, "logits/rejected": 0.9782923460006714, "logps/chosen": -604.2789916992188, "logps/rejected": -669.7869873046875, "loss": 0.5856, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.1589674949645996, "rewards/margins": 0.9365655779838562, "rewards/rejected": -4.095533847808838, "step": 3390 }, { "epoch": 0.44, "grad_norm": 11.6875, "learning_rate": 3.3971814242751123e-06, "logits/chosen": 0.7070889472961426, "logits/rejected": 1.4743976593017578, "logps/chosen": -574.7918090820312, "logps/rejected": -693.0616455078125, "loss": 0.5114, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.179802417755127, "rewards/margins": 1.2648974657058716, "rewards/rejected": -4.444699764251709, "step": 3400 }, { "epoch": 0.44, "eval_logits/chosen": 1.2128088474273682, "eval_logits/rejected": 2.149277687072754, "eval_logps/chosen": -548.863037109375, "eval_logps/rejected": -652.9094848632812, "eval_loss": 0.5196763277053833, "eval_rewards/accuracies": 0.7254999876022339, "eval_rewards/chosen": -2.8424201011657715, "eval_rewards/margins": 1.2408852577209473, "eval_rewards/rejected": -4.083305835723877, "eval_runtime": 1591.6948, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 3400 }, { "epoch": 0.45, "grad_norm": 18.25, "learning_rate": 3.386512217606339e-06, "logits/chosen": 0.46127867698669434, "logits/rejected": 1.181891679763794, "logps/chosen": -542.9649658203125, "logps/rejected": -631.12451171875, "loss": 0.5582, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.793233633041382, "rewards/margins": 0.9561799764633179, "rewards/rejected": -3.7494137287139893, "step": 3410 }, { "epoch": 0.45, "grad_norm": 16.5, "learning_rate": 3.375824510343816e-06, "logits/chosen": 0.9180284738540649, "logits/rejected": 1.5742998123168945, "logps/chosen": -563.2110595703125, "logps/rejected": -746.85205078125, "loss": 0.3963, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2570576667785645, "rewards/margins": 1.4821968078613281, "rewards/rejected": -4.739253997802734, "step": 3420 }, { "epoch": 0.45, "grad_norm": 18.25, "learning_rate": 3.3651185255289466e-06, "logits/chosen": 0.5649822354316711, "logits/rejected": 1.8119010925292969, "logps/chosen": -595.4826049804688, "logps/rejected": -661.2381591796875, "loss": 0.507, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.104897975921631, "rewards/margins": 1.009965419769287, "rewards/rejected": -4.114863395690918, "step": 3430 }, { "epoch": 0.45, "grad_norm": 22.625, "learning_rate": 3.354394486584568e-06, "logits/chosen": 0.7015897035598755, "logits/rejected": 1.8448234796524048, "logps/chosen": -575.3528442382812, "logps/rejected": -682.8203735351562, "loss": 0.5305, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.233482837677002, "rewards/margins": 1.1560642719268799, "rewards/rejected": -4.389547348022461, "step": 3440 }, { "epoch": 0.45, "grad_norm": 17.75, "learning_rate": 3.3436526173102913e-06, "logits/chosen": 1.5318998098373413, "logits/rejected": 2.359741687774658, "logps/chosen": -564.7638549804688, "logps/rejected": -700.0146484375, "loss": 0.4162, "rewards/accuracies": 0.8125, "rewards/chosen": -2.8777577877044678, "rewards/margins": 1.642392873764038, "rewards/rejected": -4.520151138305664, "step": 3450 }, { "epoch": 0.45, "grad_norm": 7.6875, "learning_rate": 3.3328931418778254e-06, "logits/chosen": 1.0667393207550049, "logits/rejected": 1.512853741645813, "logps/chosen": -573.0630493164062, "logps/rejected": -682.3682250976562, "loss": 0.5208, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.0130012035369873, "rewards/margins": 1.110229253768921, "rewards/rejected": -4.123230457305908, "step": 3460 }, { "epoch": 0.45, "grad_norm": 13.875, "learning_rate": 3.3221162848263028e-06, "logits/chosen": 0.6153064966201782, "logits/rejected": 2.2130637168884277, "logps/chosen": -586.8394165039062, "logps/rejected": -665.9385986328125, "loss": 0.502, "rewards/accuracies": 0.75, "rewards/chosen": -2.8460216522216797, "rewards/margins": 1.336303472518921, "rewards/rejected": -4.18232536315918, "step": 3470 }, { "epoch": 0.46, "grad_norm": 17.5, "learning_rate": 3.3113222710575914e-06, "logits/chosen": 1.2034531831741333, "logits/rejected": 2.5035030841827393, "logps/chosen": -584.3963623046875, "logps/rejected": -676.0372314453125, "loss": 0.4895, "rewards/accuracies": 0.75, "rewards/chosen": -2.894857168197632, "rewards/margins": 1.2428711652755737, "rewards/rejected": -4.137728214263916, "step": 3480 }, { "epoch": 0.46, "grad_norm": 21.625, "learning_rate": 3.300511325831603e-06, "logits/chosen": 0.7528663873672485, "logits/rejected": 1.6426197290420532, "logps/chosen": -545.1719360351562, "logps/rejected": -673.9959716796875, "loss": 0.4629, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.682860851287842, "rewards/margins": 1.4254639148712158, "rewards/rejected": -4.1083245277404785, "step": 3490 }, { "epoch": 0.46, "grad_norm": 14.75, "learning_rate": 3.289683674761592e-06, "logits/chosen": 1.3696619272232056, "logits/rejected": 2.1932106018066406, "logps/chosen": -555.2965087890625, "logps/rejected": -658.5980834960938, "loss": 0.4984, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.1395516395568848, "rewards/margins": 1.1953315734863281, "rewards/rejected": -4.334883213043213, "step": 3500 }, { "epoch": 0.46, "eval_logits/chosen": 2.4203193187713623, "eval_logits/rejected": 3.3501927852630615, "eval_logps/chosen": -584.5863647460938, "eval_logps/rejected": -686.7951049804688, "eval_loss": 0.500175416469574, "eval_rewards/accuracies": 0.7450000047683716, "eval_rewards/chosen": -3.199653148651123, "eval_rewards/margins": 1.222508430480957, "eval_rewards/rejected": -4.42216157913208, "eval_runtime": 1591.0902, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 3500 }, { "epoch": 0.46, "grad_norm": 10.1875, "learning_rate": 3.2788395438094444e-06, "logits/chosen": 1.513866662979126, "logits/rejected": 2.8263657093048096, "logps/chosen": -604.82861328125, "logps/rejected": -735.3837890625, "loss": 0.4415, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.161881923675537, "rewards/margins": 1.5348942279815674, "rewards/rejected": -4.696776390075684, "step": 3510 }, { "epoch": 0.46, "grad_norm": 12.3125, "learning_rate": 3.2679791592809653e-06, "logits/chosen": 1.3661248683929443, "logits/rejected": 2.9377036094665527, "logps/chosen": -630.15966796875, "logps/rejected": -705.7320556640625, "loss": 0.4701, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.424785614013672, "rewards/margins": 1.3289605379104614, "rewards/rejected": -4.753746032714844, "step": 3520 }, { "epoch": 0.46, "grad_norm": 20.75, "learning_rate": 3.257102747821157e-06, "logits/chosen": 1.612074851989746, "logits/rejected": 2.272280216217041, "logps/chosen": -618.0646362304688, "logps/rejected": -717.951171875, "loss": 0.5457, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.4815406799316406, "rewards/margins": 1.1168197393417358, "rewards/rejected": -4.598360538482666, "step": 3530 }, { "epoch": 0.46, "grad_norm": 5.90625, "learning_rate": 3.246210536409484e-06, "logits/chosen": 0.9772621989250183, "logits/rejected": 2.2998509407043457, "logps/chosen": -603.9741821289062, "logps/rejected": -677.609130859375, "loss": 0.4589, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.1230061054229736, "rewards/margins": 1.463616132736206, "rewards/rejected": -4.586622714996338, "step": 3540 }, { "epoch": 0.46, "grad_norm": 10.5625, "learning_rate": 3.235302752355142e-06, "logits/chosen": 1.1950008869171143, "logits/rejected": 2.352081775665283, "logps/chosen": -600.2168579101562, "logps/rejected": -651.6273803710938, "loss": 0.5152, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2299892902374268, "rewards/margins": 1.007690191268921, "rewards/rejected": -4.237679481506348, "step": 3550 }, { "epoch": 0.47, "grad_norm": 17.25, "learning_rate": 3.2243796232923097e-06, "logits/chosen": 1.0240675210952759, "logits/rejected": 2.123300790786743, "logps/chosen": -528.0433349609375, "logps/rejected": -596.7354736328125, "loss": 0.4909, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9449267387390137, "rewards/margins": 1.1938796043395996, "rewards/rejected": -4.138806343078613, "step": 3560 }, { "epoch": 0.47, "grad_norm": 13.0, "learning_rate": 3.2134413771754037e-06, "logits/chosen": 1.0020049810409546, "logits/rejected": 2.169445037841797, "logps/chosen": -583.1009521484375, "logps/rejected": -682.7677612304688, "loss": 0.4336, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.9823849201202393, "rewards/margins": 1.2904002666473389, "rewards/rejected": -4.272785186767578, "step": 3570 }, { "epoch": 0.47, "grad_norm": 8.625, "learning_rate": 3.2024882422743118e-06, "logits/chosen": 1.328412652015686, "logits/rejected": 2.1645195484161377, "logps/chosen": -493.51678466796875, "logps/rejected": -633.2418212890625, "loss": 0.3906, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.7480642795562744, "rewards/margins": 1.3832073211669922, "rewards/rejected": -4.1312713623046875, "step": 3580 }, { "epoch": 0.47, "grad_norm": 12.3125, "learning_rate": 3.1915204471696425e-06, "logits/chosen": 1.0298570394515991, "logits/rejected": 2.2715506553649902, "logps/chosen": -526.4378662109375, "logps/rejected": -669.9403076171875, "loss": 0.4011, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.6348226070404053, "rewards/margins": 1.5454890727996826, "rewards/rejected": -4.180312156677246, "step": 3590 }, { "epoch": 0.47, "grad_norm": 7.3125, "learning_rate": 3.180538220747943e-06, "logits/chosen": 1.3050510883331299, "logits/rejected": 1.3333051204681396, "logps/chosen": -555.6622924804688, "logps/rejected": -714.8551635742188, "loss": 0.5723, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.9709713459014893, "rewards/margins": 1.0748671293258667, "rewards/rejected": -4.045838832855225, "step": 3600 }, { "epoch": 0.47, "eval_logits/chosen": 2.2597780227661133, "eval_logits/rejected": 3.1533713340759277, "eval_logps/chosen": -565.27490234375, "eval_logps/rejected": -668.9721069335938, "eval_loss": 0.5010019540786743, "eval_rewards/accuracies": 0.7409999966621399, "eval_rewards/chosen": -3.006538152694702, "eval_rewards/margins": 1.2373936176300049, "eval_rewards/rejected": -4.243931770324707, "eval_runtime": 1613.3695, "eval_samples_per_second": 1.24, "eval_steps_per_second": 0.31, "step": 3600 }, { "epoch": 0.47, "grad_norm": 16.125, "learning_rate": 3.1695417921969287e-06, "logits/chosen": 1.2470731735229492, "logits/rejected": 2.3240132331848145, "logps/chosen": -545.6837768554688, "logps/rejected": -646.6546020507812, "loss": 0.4794, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.9675731658935547, "rewards/margins": 1.2627222537994385, "rewards/rejected": -4.230295181274414, "step": 3610 }, { "epoch": 0.47, "grad_norm": 14.0, "learning_rate": 3.158531391000697e-06, "logits/chosen": 0.9274897575378418, "logits/rejected": 1.4380440711975098, "logps/chosen": -529.5089721679688, "logps/rejected": -630.5634155273438, "loss": 0.6304, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.034899950027466, "rewards/margins": 0.9231551289558411, "rewards/rejected": -3.958055019378662, "step": 3620 }, { "epoch": 0.48, "grad_norm": 33.25, "learning_rate": 3.147507246934943e-06, "logits/chosen": 0.8136155009269714, "logits/rejected": 1.3876383304595947, "logps/chosen": -548.38232421875, "logps/rejected": -674.1826782226562, "loss": 0.4906, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.774062395095825, "rewards/margins": 1.34238600730896, "rewards/rejected": -4.116448402404785, "step": 3630 }, { "epoch": 0.48, "grad_norm": 14.875, "learning_rate": 3.136469590062158e-06, "logits/chosen": 0.4745601713657379, "logits/rejected": 1.5338537693023682, "logps/chosen": -516.0549926757812, "logps/rejected": -564.0617065429688, "loss": 0.4777, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.353170156478882, "rewards/margins": 1.0227165222167969, "rewards/rejected": -3.3758864402770996, "step": 3640 }, { "epoch": 0.48, "grad_norm": 17.0, "learning_rate": 3.1254186507268354e-06, "logits/chosen": 0.6987334489822388, "logits/rejected": 1.4793400764465332, "logps/chosen": -501.6717224121094, "logps/rejected": -627.906982421875, "loss": 0.3542, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.213365077972412, "rewards/margins": 1.541622281074524, "rewards/rejected": -3.7549872398376465, "step": 3650 }, { "epoch": 0.48, "grad_norm": 20.375, "learning_rate": 3.114354659550656e-06, "logits/chosen": 0.7804635167121887, "logits/rejected": 2.090153217315674, "logps/chosen": -529.9520263671875, "logps/rejected": -633.1751098632812, "loss": 0.4671, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.8481204509735107, "rewards/margins": 1.3539659976959229, "rewards/rejected": -4.202086448669434, "step": 3660 }, { "epoch": 0.48, "grad_norm": 15.75, "learning_rate": 3.1032778474276816e-06, "logits/chosen": 1.4736740589141846, "logits/rejected": 2.5067548751831055, "logps/chosen": -538.311767578125, "logps/rejected": -713.0725708007812, "loss": 0.4175, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.9481959342956543, "rewards/margins": 1.6900793313980103, "rewards/rejected": -4.638275146484375, "step": 3670 }, { "epoch": 0.48, "grad_norm": 6.46875, "learning_rate": 3.092188445519532e-06, "logits/chosen": 1.6656243801116943, "logits/rejected": 3.1111550331115723, "logps/chosen": -513.4066162109375, "logps/rejected": -629.7359619140625, "loss": 0.4822, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.9789786338806152, "rewards/margins": 1.4785501956939697, "rewards/rejected": -4.457529544830322, "step": 3680 }, { "epoch": 0.48, "grad_norm": 12.9375, "learning_rate": 3.081086685250565e-06, "logits/chosen": 1.1992745399475098, "logits/rejected": 2.9142909049987793, "logps/chosen": -667.6442260742188, "logps/rejected": -707.6119384765625, "loss": 0.4743, "rewards/accuracies": 0.75, "rewards/chosen": -3.285987138748169, "rewards/margins": 1.1857128143310547, "rewards/rejected": -4.4716997146606445, "step": 3690 }, { "epoch": 0.48, "grad_norm": 16.25, "learning_rate": 3.0699727983030434e-06, "logits/chosen": 1.3589823246002197, "logits/rejected": 2.0958337783813477, "logps/chosen": -599.2689208984375, "logps/rejected": -723.6403198242188, "loss": 0.5496, "rewards/accuracies": 0.75, "rewards/chosen": -3.401282787322998, "rewards/margins": 1.1826626062393188, "rewards/rejected": -4.5839457511901855, "step": 3700 }, { "epoch": 0.48, "eval_logits/chosen": 2.4471547603607178, "eval_logits/rejected": 3.31201171875, "eval_logps/chosen": -570.4303588867188, "eval_logps/rejected": -677.9390869140625, "eval_loss": 0.5015013813972473, "eval_rewards/accuracies": 0.7394999861717224, "eval_rewards/chosen": -3.058093786239624, "eval_rewards/margins": 1.2755075693130493, "eval_rewards/rejected": -4.333600997924805, "eval_runtime": 1591.8744, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 3700 }, { "epoch": 0.49, "grad_norm": 23.0, "learning_rate": 3.058847016612301e-06, "logits/chosen": 1.3370041847229004, "logits/rejected": 2.4886152744293213, "logps/chosen": -585.2352294921875, "logps/rejected": -648.9256591796875, "loss": 0.5683, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.335144519805908, "rewards/margins": 1.0732650756835938, "rewards/rejected": -4.40841007232666, "step": 3710 }, { "epoch": 0.49, "grad_norm": 11.375, "learning_rate": 3.0477095723619034e-06, "logits/chosen": 1.6717097759246826, "logits/rejected": 2.2256245613098145, "logps/chosen": -526.5162963867188, "logps/rejected": -641.79638671875, "loss": 0.4822, "rewards/accuracies": 0.75, "rewards/chosen": -2.7065768241882324, "rewards/margins": 1.269716501235962, "rewards/rejected": -3.9762935638427734, "step": 3720 }, { "epoch": 0.49, "grad_norm": 20.5, "learning_rate": 3.0365606979788003e-06, "logits/chosen": 1.271213173866272, "logits/rejected": 2.268070936203003, "logps/chosen": -607.2047119140625, "logps/rejected": -735.3359375, "loss": 0.4284, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.146824836730957, "rewards/margins": 1.4212199449539185, "rewards/rejected": -4.568044662475586, "step": 3730 }, { "epoch": 0.49, "grad_norm": 15.9375, "learning_rate": 3.0254006261284786e-06, "logits/chosen": 1.67838454246521, "logits/rejected": 2.5358495712280273, "logps/chosen": -625.5853881835938, "logps/rejected": -768.9443969726562, "loss": 0.4578, "rewards/accuracies": 0.75, "rewards/chosen": -3.5043373107910156, "rewards/margins": 1.5824673175811768, "rewards/rejected": -5.086804389953613, "step": 3740 }, { "epoch": 0.49, "grad_norm": 17.625, "learning_rate": 3.0142295897101032e-06, "logits/chosen": 1.197113275527954, "logits/rejected": 1.7569679021835327, "logps/chosen": -588.1669311523438, "logps/rejected": -715.07568359375, "loss": 0.4599, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.412364959716797, "rewards/margins": 1.3579638004302979, "rewards/rejected": -4.770328521728516, "step": 3750 }, { "epoch": 0.49, "grad_norm": 8.25, "learning_rate": 3.0030478218516578e-06, "logits/chosen": 0.9988697171211243, "logits/rejected": 2.3596878051757812, "logps/chosen": -614.2236328125, "logps/rejected": -727.8824462890625, "loss": 0.4997, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.321038007736206, "rewards/margins": 1.4818624258041382, "rewards/rejected": -4.802900791168213, "step": 3760 }, { "epoch": 0.49, "grad_norm": 6.4375, "learning_rate": 2.9918555559050826e-06, "logits/chosen": 0.6846061944961548, "logits/rejected": 1.5343295335769653, "logps/chosen": -570.2298583984375, "logps/rejected": -632.1084594726562, "loss": 0.5979, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.1014671325683594, "rewards/margins": 1.0553886890411377, "rewards/rejected": -4.156855583190918, "step": 3770 }, { "epoch": 0.49, "grad_norm": 15.3125, "learning_rate": 2.980653025441399e-06, "logits/chosen": 1.6924612522125244, "logits/rejected": 2.326639175415039, "logps/chosen": -585.7066650390625, "logps/rejected": -714.9122924804688, "loss": 0.4739, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.2248730659484863, "rewards/margins": 1.3962581157684326, "rewards/rejected": -4.621131420135498, "step": 3780 }, { "epoch": 0.5, "grad_norm": 5.8125, "learning_rate": 2.969440464245841e-06, "logits/chosen": 1.1217143535614014, "logits/rejected": 2.3194773197174072, "logps/chosen": -560.1303100585938, "logps/rejected": -688.375732421875, "loss": 0.5052, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.2385811805725098, "rewards/margins": 1.344367504119873, "rewards/rejected": -4.582947731018066, "step": 3790 }, { "epoch": 0.5, "grad_norm": 23.0, "learning_rate": 2.95821810631297e-06, "logits/chosen": 1.2779474258422852, "logits/rejected": 2.844780445098877, "logps/chosen": -663.8018798828125, "logps/rejected": -746.4864501953125, "loss": 0.5106, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.8018813133239746, "rewards/margins": 1.2830824851989746, "rewards/rejected": -5.084963798522949, "step": 3800 }, { "epoch": 0.5, "eval_logits/chosen": 1.8546652793884277, "eval_logits/rejected": 2.713374614715576, "eval_logps/chosen": -615.3915405273438, "eval_logps/rejected": -726.6728515625, "eval_loss": 0.5012689232826233, "eval_rewards/accuracies": 0.7394999861717224, "eval_rewards/chosen": -3.507704973220825, "eval_rewards/margins": 1.3132338523864746, "eval_rewards/rejected": -4.820939064025879, "eval_runtime": 1592.0058, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 3800 }, { "epoch": 0.5, "grad_norm": 31.625, "learning_rate": 2.946986185841801e-06, "logits/chosen": 0.9378841519355774, "logits/rejected": 1.745422124862671, "logps/chosen": -627.324462890625, "logps/rejected": -722.0845947265625, "loss": 0.4964, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.6624836921691895, "rewards/margins": 1.261169672012329, "rewards/rejected": -4.923653602600098, "step": 3810 }, { "epoch": 0.5, "grad_norm": 12.75, "learning_rate": 2.935744937230903e-06, "logits/chosen": 1.396041750907898, "logits/rejected": 1.7260946035385132, "logps/chosen": -622.05126953125, "logps/rejected": -763.8098754882812, "loss": 0.568, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.608628749847412, "rewards/margins": 1.256679892539978, "rewards/rejected": -4.8653082847595215, "step": 3820 }, { "epoch": 0.5, "grad_norm": 8.8125, "learning_rate": 2.924494595073517e-06, "logits/chosen": 0.6841916441917419, "logits/rejected": 1.274814248085022, "logps/chosen": -572.808349609375, "logps/rejected": -694.9969482421875, "loss": 0.482, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.484985828399658, "rewards/margins": 1.2095296382904053, "rewards/rejected": -4.694515228271484, "step": 3830 }, { "epoch": 0.5, "grad_norm": 13.1875, "learning_rate": 2.9132353941526575e-06, "logits/chosen": 0.7297332286834717, "logits/rejected": 1.7951176166534424, "logps/chosen": -656.1531372070312, "logps/rejected": -790.7908325195312, "loss": 0.4268, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.7499594688415527, "rewards/margins": 1.6107057332992554, "rewards/rejected": -5.360665321350098, "step": 3840 }, { "epoch": 0.5, "grad_norm": 16.25, "learning_rate": 2.901967569436209e-06, "logits/chosen": 0.5449053049087524, "logits/rejected": 1.2905735969543457, "logps/chosen": -650.99853515625, "logps/rejected": -721.5765380859375, "loss": 0.543, "rewards/accuracies": 0.6875, "rewards/chosen": -3.8416759967803955, "rewards/margins": 1.0251166820526123, "rewards/rejected": -4.86679220199585, "step": 3850 }, { "epoch": 0.51, "grad_norm": 39.0, "learning_rate": 2.89069135607203e-06, "logits/chosen": 0.5232194662094116, "logits/rejected": 1.2776695489883423, "logps/chosen": -597.3203735351562, "logps/rejected": -697.5950927734375, "loss": 0.5231, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.524958848953247, "rewards/margins": 1.080949306488037, "rewards/rejected": -4.605908393859863, "step": 3860 }, { "epoch": 0.51, "grad_norm": 15.6875, "learning_rate": 2.8794069893830386e-06, "logits/chosen": 0.9736347198486328, "logits/rejected": 2.3645083904266357, "logps/chosen": -596.4930419921875, "logps/rejected": -694.7677001953125, "loss": 0.5365, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.4399502277374268, "rewards/margins": 1.128644347190857, "rewards/rejected": -4.568594932556152, "step": 3870 }, { "epoch": 0.51, "grad_norm": 8.4375, "learning_rate": 2.8681147048623038e-06, "logits/chosen": 0.4126865863800049, "logits/rejected": 1.3872052431106567, "logps/chosen": -606.4537353515625, "logps/rejected": -725.9848022460938, "loss": 0.405, "rewards/accuracies": 0.8125, "rewards/chosen": -2.983593225479126, "rewards/margins": 1.3947454690933228, "rewards/rejected": -4.378338813781738, "step": 3880 }, { "epoch": 0.51, "grad_norm": 27.5, "learning_rate": 2.8568147381681333e-06, "logits/chosen": 0.9017072916030884, "logits/rejected": 1.5411750078201294, "logps/chosen": -546.4932250976562, "logps/rejected": -644.0993041992188, "loss": 0.6068, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.0471713542938232, "rewards/margins": 1.2161060571670532, "rewards/rejected": -4.263277530670166, "step": 3890 }, { "epoch": 0.51, "grad_norm": 11.5, "learning_rate": 2.8455073251191533e-06, "logits/chosen": 0.5226460695266724, "logits/rejected": 1.6768258810043335, "logps/chosen": -612.2659912109375, "logps/rejected": -740.1510620117188, "loss": 0.376, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.243638515472412, "rewards/margins": 1.695278525352478, "rewards/rejected": -4.938917636871338, "step": 3900 }, { "epoch": 0.51, "eval_logits/chosen": 1.9628366231918335, "eval_logits/rejected": 2.773855447769165, "eval_logps/chosen": -590.9802856445312, "eval_logps/rejected": -697.17529296875, "eval_loss": 0.4994959235191345, "eval_rewards/accuracies": 0.7415000200271606, "eval_rewards/chosen": -3.263592481613159, "eval_rewards/margins": 1.2623705863952637, "eval_rewards/rejected": -4.525962829589844, "eval_runtime": 1593.0408, "eval_samples_per_second": 1.255, "eval_steps_per_second": 0.314, "step": 3900 }, { "epoch": 0.51, "grad_norm": 8.625, "learning_rate": 2.8341927016893887e-06, "logits/chosen": 1.4300041198730469, "logits/rejected": 1.2578006982803345, "logps/chosen": -591.4417114257812, "logps/rejected": -711.3917236328125, "loss": 0.4788, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3117687702178955, "rewards/margins": 1.2131303548812866, "rewards/rejected": -4.524899482727051, "step": 3910 }, { "epoch": 0.51, "grad_norm": 14.0, "learning_rate": 2.822871104003335e-06, "logits/chosen": 0.9081200361251831, "logits/rejected": 2.4838366508483887, "logps/chosen": -589.5599365234375, "logps/rejected": -662.2095947265625, "loss": 0.4594, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.023688316345215, "rewards/margins": 1.5029232501983643, "rewards/rejected": -4.526611328125, "step": 3920 }, { "epoch": 0.51, "grad_norm": 12.875, "learning_rate": 2.8115427683310355e-06, "logits/chosen": 0.47768887877464294, "logits/rejected": 1.4875624179840088, "logps/chosen": -587.0402221679688, "logps/rejected": -677.3700561523438, "loss": 0.5306, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.184080123901367, "rewards/margins": 1.084136962890625, "rewards/rejected": -4.268217086791992, "step": 3930 }, { "epoch": 0.52, "grad_norm": 30.875, "learning_rate": 2.8002079310831477e-06, "logits/chosen": 0.6028070449829102, "logits/rejected": 1.0062382221221924, "logps/chosen": -555.0243530273438, "logps/rejected": -614.6267700195312, "loss": 0.65, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.950934410095215, "rewards/margins": 0.8431466817855835, "rewards/rejected": -3.794081211090088, "step": 3940 }, { "epoch": 0.52, "grad_norm": 5.6875, "learning_rate": 2.7888668288060095e-06, "logits/chosen": 0.39542144536972046, "logits/rejected": 1.2938110828399658, "logps/chosen": -533.0115356445312, "logps/rejected": -613.7713623046875, "loss": 0.5404, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.655400514602661, "rewards/margins": 1.038409948348999, "rewards/rejected": -3.6938107013702393, "step": 3950 }, { "epoch": 0.52, "grad_norm": 12.75, "learning_rate": 2.7775196981767044e-06, "logits/chosen": 0.7035880088806152, "logits/rejected": 1.4570562839508057, "logps/chosen": -535.3646240234375, "logps/rejected": -627.9224853515625, "loss": 0.5444, "rewards/accuracies": 0.75, "rewards/chosen": -2.855009078979492, "rewards/margins": 1.1241127252578735, "rewards/rejected": -3.9791221618652344, "step": 3960 }, { "epoch": 0.52, "grad_norm": 20.0, "learning_rate": 2.7661667759981213e-06, "logits/chosen": 0.7251302003860474, "logits/rejected": 1.8517974615097046, "logps/chosen": -567.191162109375, "logps/rejected": -618.9002685546875, "loss": 0.6056, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.1272902488708496, "rewards/margins": 0.8452841639518738, "rewards/rejected": -3.9725747108459473, "step": 3970 }, { "epoch": 0.52, "grad_norm": 38.5, "learning_rate": 2.7548082991940137e-06, "logits/chosen": 0.8394457101821899, "logits/rejected": 1.7973182201385498, "logps/chosen": -553.3853149414062, "logps/rejected": -612.0332641601562, "loss": 0.6223, "rewards/accuracies": 0.75, "rewards/chosen": -3.0196127891540527, "rewards/margins": 0.9670158624649048, "rewards/rejected": -3.986628770828247, "step": 3980 }, { "epoch": 0.52, "grad_norm": 9.4375, "learning_rate": 2.743444504804051e-06, "logits/chosen": 0.6090790033340454, "logits/rejected": 1.5463039875030518, "logps/chosen": -559.8428955078125, "logps/rejected": -631.6338500976562, "loss": 0.4786, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.7995877265930176, "rewards/margins": 1.0208765268325806, "rewards/rejected": -3.8204643726348877, "step": 3990 }, { "epoch": 0.52, "grad_norm": 17.125, "learning_rate": 2.7320756299788788e-06, "logits/chosen": 0.40987616777420044, "logits/rejected": 1.202165961265564, "logps/chosen": -534.3167724609375, "logps/rejected": -659.1214599609375, "loss": 0.4935, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.807413101196289, "rewards/margins": 1.016972303390503, "rewards/rejected": -3.824385404586792, "step": 4000 }, { "epoch": 0.52, "eval_logits/chosen": 1.5516222715377808, "eval_logits/rejected": 2.2899205684661865, "eval_logps/chosen": -547.131103515625, "eval_logps/rejected": -640.8605346679688, "eval_loss": 0.4915643334388733, "eval_rewards/accuracies": 0.7465000152587891, "eval_rewards/chosen": -2.825101137161255, "eval_rewards/margins": 1.137715458869934, "eval_rewards/rejected": -3.9628164768218994, "eval_runtime": 1592.3492, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 4000 }, { "epoch": 0.52, "grad_norm": 13.125, "learning_rate": 2.7207019119751644e-06, "logits/chosen": 0.3096240162849426, "logits/rejected": 1.2950459718704224, "logps/chosen": -519.6134643554688, "logps/rejected": -585.9755859375, "loss": 0.4758, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.6585440635681152, "rewards/margins": 1.1354074478149414, "rewards/rejected": -3.7939515113830566, "step": 4010 }, { "epoch": 0.53, "grad_norm": 33.25, "learning_rate": 2.7093235881506474e-06, "logits/chosen": 1.2475144863128662, "logits/rejected": 1.9860883951187134, "logps/chosen": -569.3560791015625, "logps/rejected": -669.8253784179688, "loss": 0.4712, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9248297214508057, "rewards/margins": 1.4279091358184814, "rewards/rejected": -4.352738857269287, "step": 4020 }, { "epoch": 0.53, "grad_norm": 14.375, "learning_rate": 2.6979408959591863e-06, "logits/chosen": 0.5688766241073608, "logits/rejected": 0.8400389552116394, "logps/chosen": -561.5079956054688, "logps/rejected": -659.046875, "loss": 0.6009, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.01423716545105, "rewards/margins": 0.922036349773407, "rewards/rejected": -3.9362735748291016, "step": 4030 }, { "epoch": 0.53, "grad_norm": 17.375, "learning_rate": 2.6865540729458034e-06, "logits/chosen": 0.10123654454946518, "logits/rejected": 0.8616682887077332, "logps/chosen": -576.2913208007812, "logps/rejected": -641.7877807617188, "loss": 0.5364, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.760798931121826, "rewards/margins": 1.0089768171310425, "rewards/rejected": -3.769775867462158, "step": 4040 }, { "epoch": 0.53, "grad_norm": 10.4375, "learning_rate": 2.675163356741726e-06, "logits/chosen": 0.42007890343666077, "logits/rejected": 1.0813418626785278, "logps/chosen": -470.1661682128906, "logps/rejected": -569.1080932617188, "loss": 0.4399, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.5839219093322754, "rewards/margins": 1.1666027307510376, "rewards/rejected": -3.7505240440368652, "step": 4050 }, { "epoch": 0.53, "grad_norm": 8.625, "learning_rate": 2.6637689850594285e-06, "logits/chosen": 0.4460233747959137, "logits/rejected": 1.1718910932540894, "logps/chosen": -530.4000854492188, "logps/rejected": -595.0211181640625, "loss": 0.5224, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.648142099380493, "rewards/margins": 0.9859753847122192, "rewards/rejected": -3.634117603302002, "step": 4060 }, { "epoch": 0.53, "grad_norm": 14.5, "learning_rate": 2.652371195687671e-06, "logits/chosen": 0.7797343730926514, "logits/rejected": 1.4408290386199951, "logps/chosen": -500.3291015625, "logps/rejected": -628.8976440429688, "loss": 0.4065, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4375433921813965, "rewards/margins": 1.5079452991485596, "rewards/rejected": -3.945488691329956, "step": 4070 }, { "epoch": 0.53, "grad_norm": 33.5, "learning_rate": 2.64097022648654e-06, "logits/chosen": 0.49426335096359253, "logits/rejected": 1.7455838918685913, "logps/chosen": -594.4205932617188, "logps/rejected": -676.2381591796875, "loss": 0.5492, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9470136165618896, "rewards/margins": 1.3631705045700073, "rewards/rejected": -4.310183525085449, "step": 4080 }, { "epoch": 0.54, "grad_norm": 12.6875, "learning_rate": 2.6295663153824774e-06, "logits/chosen": 0.4955861568450928, "logits/rejected": 1.726845145225525, "logps/chosen": -572.8784790039062, "logps/rejected": -633.9725952148438, "loss": 0.5038, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.8294196128845215, "rewards/margins": 1.260789394378662, "rewards/rejected": -4.090209007263184, "step": 4090 }, { "epoch": 0.54, "grad_norm": 12.5625, "learning_rate": 2.6181597003633218e-06, "logits/chosen": 0.7141835689544678, "logits/rejected": 2.0009679794311523, "logps/chosen": -566.5924072265625, "logps/rejected": -663.8424072265625, "loss": 0.445, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.8427367210388184, "rewards/margins": 1.4427145719528198, "rewards/rejected": -4.2854509353637695, "step": 4100 }, { "epoch": 0.54, "eval_logits/chosen": 1.8263081312179565, "eval_logits/rejected": 2.594874858856201, "eval_logps/chosen": -577.61767578125, "eval_logps/rejected": -685.20458984375, "eval_loss": 0.4958656132221222, "eval_rewards/accuracies": 0.7480000257492065, "eval_rewards/chosen": -3.1299664974212646, "eval_rewards/margins": 1.276289939880371, "eval_rewards/rejected": -4.406256198883057, "eval_runtime": 1592.5081, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 4100 }, { "epoch": 0.54, "grad_norm": 8.6875, "learning_rate": 2.606750619473342e-06, "logits/chosen": 1.2048364877700806, "logits/rejected": 1.8250070810317993, "logps/chosen": -559.253662109375, "logps/rejected": -670.3594970703125, "loss": 0.465, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.046375036239624, "rewards/margins": 1.1393458843231201, "rewards/rejected": -4.185720920562744, "step": 4110 }, { "epoch": 0.54, "grad_norm": 19.25, "learning_rate": 2.595339310808262e-06, "logits/chosen": 0.7069088220596313, "logits/rejected": 1.2920167446136475, "logps/chosen": -541.710205078125, "logps/rejected": -664.6094360351562, "loss": 0.3956, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.00246524810791, "rewards/margins": 1.356275200843811, "rewards/rejected": -4.358740329742432, "step": 4120 }, { "epoch": 0.54, "grad_norm": 26.125, "learning_rate": 2.5839260125103004e-06, "logits/chosen": 0.9028911590576172, "logits/rejected": 1.8872220516204834, "logps/chosen": -589.1178588867188, "logps/rejected": -703.8515625, "loss": 0.428, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1477065086364746, "rewards/margins": 1.3748151063919067, "rewards/rejected": -4.522521495819092, "step": 4130 }, { "epoch": 0.54, "grad_norm": 24.25, "learning_rate": 2.5725109627631984e-06, "logits/chosen": 0.8007787466049194, "logits/rejected": 1.5194114446640015, "logps/chosen": -595.8224487304688, "logps/rejected": -692.4710693359375, "loss": 0.4624, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.188469409942627, "rewards/margins": 1.274657964706421, "rewards/rejected": -4.463127613067627, "step": 4140 }, { "epoch": 0.54, "grad_norm": 47.5, "learning_rate": 2.5610943997872443e-06, "logits/chosen": 1.1802794933319092, "logits/rejected": 1.5942678451538086, "logps/chosen": -628.5667114257812, "logps/rejected": -740.0662231445312, "loss": 0.5069, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.7526698112487793, "rewards/margins": 1.1907706260681152, "rewards/rejected": -4.9434404373168945, "step": 4150 }, { "epoch": 0.54, "grad_norm": 22.125, "learning_rate": 2.5496765618343096e-06, "logits/chosen": 0.9174972772598267, "logits/rejected": 1.2704370021820068, "logps/chosen": -620.0501098632812, "logps/rejected": -710.2423095703125, "loss": 0.6713, "rewards/accuracies": 0.625, "rewards/chosen": -3.6227316856384277, "rewards/margins": 0.9411904215812683, "rewards/rejected": -4.563921928405762, "step": 4160 }, { "epoch": 0.55, "grad_norm": 13.5, "learning_rate": 2.538257687182871e-06, "logits/chosen": 1.4342944622039795, "logits/rejected": 2.462162494659424, "logps/chosen": -576.3062744140625, "logps/rejected": -697.0335693359375, "loss": 0.5501, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.4480528831481934, "rewards/margins": 1.26388680934906, "rewards/rejected": -4.711940288543701, "step": 4170 }, { "epoch": 0.55, "grad_norm": 32.5, "learning_rate": 2.526838014133041e-06, "logits/chosen": 1.0441315174102783, "logits/rejected": 1.7808837890625, "logps/chosen": -587.8073120117188, "logps/rejected": -674.4149169921875, "loss": 0.5703, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.2182915210723877, "rewards/margins": 1.070989966392517, "rewards/rejected": -4.289281368255615, "step": 4180 }, { "epoch": 0.55, "grad_norm": 10.25, "learning_rate": 2.515417781001594e-06, "logits/chosen": 0.7826089859008789, "logits/rejected": 1.5784668922424316, "logps/chosen": -499.166015625, "logps/rejected": -613.2916259765625, "loss": 0.4251, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.5489115715026855, "rewards/margins": 1.3632811307907104, "rewards/rejected": -3.9121928215026855, "step": 4190 }, { "epoch": 0.55, "grad_norm": 8.9375, "learning_rate": 2.503997226116992e-06, "logits/chosen": 1.1384754180908203, "logits/rejected": 1.1739859580993652, "logps/chosen": -487.3101501464844, "logps/rejected": -636.6060180664062, "loss": 0.443, "rewards/accuracies": 0.75, "rewards/chosen": -2.4144322872161865, "rewards/margins": 1.3946157693862915, "rewards/rejected": -3.8090476989746094, "step": 4200 }, { "epoch": 0.55, "eval_logits/chosen": 1.7637161016464233, "eval_logits/rejected": 2.5643436908721924, "eval_logps/chosen": -525.6652221679688, "eval_logps/rejected": -636.2509765625, "eval_loss": 0.5038745999336243, "eval_rewards/accuracies": 0.734499990940094, "eval_rewards/chosen": -2.6104423999786377, "eval_rewards/margins": 1.306278109550476, "eval_rewards/rejected": -3.916720390319824, "eval_runtime": 1591.7784, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 4200 }, { "epoch": 0.55, "grad_norm": 13.3125, "learning_rate": 2.4925765878144115e-06, "logits/chosen": 1.3933148384094238, "logits/rejected": 2.002098560333252, "logps/chosen": -489.69842529296875, "logps/rejected": -591.0333862304688, "loss": 0.5167, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.4099645614624023, "rewards/margins": 1.2759312391281128, "rewards/rejected": -3.6858959197998047, "step": 4210 }, { "epoch": 0.55, "grad_norm": 23.375, "learning_rate": 2.4811561044307727e-06, "logits/chosen": 0.9644983410835266, "logits/rejected": 1.7500193119049072, "logps/chosen": -487.23187255859375, "logps/rejected": -627.9241943359375, "loss": 0.4398, "rewards/accuracies": 0.8125, "rewards/chosen": -2.5000617504119873, "rewards/margins": 1.471921682357788, "rewards/rejected": -3.9719836711883545, "step": 4220 }, { "epoch": 0.55, "grad_norm": 16.125, "learning_rate": 2.469736014299758e-06, "logits/chosen": 0.8705085515975952, "logits/rejected": 1.7094383239746094, "logps/chosen": -528.7936401367188, "logps/rejected": -636.2086181640625, "loss": 0.411, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.6657662391662598, "rewards/margins": 1.3999571800231934, "rewards/rejected": -4.065723419189453, "step": 4230 }, { "epoch": 0.55, "grad_norm": 17.75, "learning_rate": 2.458316555746846e-06, "logits/chosen": 1.1141247749328613, "logits/rejected": 1.7977077960968018, "logps/chosen": -582.32958984375, "logps/rejected": -719.8034057617188, "loss": 0.5544, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.231219530105591, "rewards/margins": 1.3708423376083374, "rewards/rejected": -4.6020612716674805, "step": 4240 }, { "epoch": 0.56, "grad_norm": 14.875, "learning_rate": 2.446897967084334e-06, "logits/chosen": 1.4410032033920288, "logits/rejected": 2.145479679107666, "logps/chosen": -590.9288940429688, "logps/rejected": -746.0863037109375, "loss": 0.4088, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.4452788829803467, "rewards/margins": 1.6681804656982422, "rewards/rejected": -5.113459587097168, "step": 4250 }, { "epoch": 0.56, "grad_norm": 26.5, "learning_rate": 2.4354804866063684e-06, "logits/chosen": 1.1707557439804077, "logits/rejected": 1.9762599468231201, "logps/chosen": -612.4578857421875, "logps/rejected": -760.2510986328125, "loss": 0.3818, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.5203795433044434, "rewards/margins": 1.672025442123413, "rewards/rejected": -5.1924052238464355, "step": 4260 }, { "epoch": 0.56, "grad_norm": 16.25, "learning_rate": 2.424064352583964e-06, "logits/chosen": 1.2009286880493164, "logits/rejected": 2.1041266918182373, "logps/chosen": -564.6832275390625, "logps/rejected": -678.2476806640625, "loss": 0.4951, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.084103584289551, "rewards/margins": 1.2054665088653564, "rewards/rejected": -4.28956937789917, "step": 4270 }, { "epoch": 0.56, "grad_norm": 8.1875, "learning_rate": 2.4126498032600403e-06, "logits/chosen": 0.9922968149185181, "logits/rejected": 1.713650107383728, "logps/chosen": -616.302734375, "logps/rejected": -700.6239624023438, "loss": 0.5001, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2124435901641846, "rewards/margins": 1.3036648035049438, "rewards/rejected": -4.51610803604126, "step": 4280 }, { "epoch": 0.56, "grad_norm": 22.625, "learning_rate": 2.401237076844445e-06, "logits/chosen": 1.120624303817749, "logits/rejected": 1.7306255102157593, "logps/chosen": -510.9451599121094, "logps/rejected": -606.6317749023438, "loss": 0.5167, "rewards/accuracies": 0.75, "rewards/chosen": -2.5382046699523926, "rewards/margins": 1.3079121112823486, "rewards/rejected": -3.8461170196533203, "step": 4290 }, { "epoch": 0.56, "grad_norm": 18.375, "learning_rate": 2.38982641150898e-06, "logits/chosen": 1.0842640399932861, "logits/rejected": 1.4181780815124512, "logps/chosen": -489.26507568359375, "logps/rejected": -598.4462280273438, "loss": 0.517, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.7224583625793457, "rewards/margins": 1.2289758920669556, "rewards/rejected": -3.9514336585998535, "step": 4300 }, { "epoch": 0.56, "eval_logits/chosen": 1.854472279548645, "eval_logits/rejected": 2.621220588684082, "eval_logps/chosen": -570.7054443359375, "eval_logps/rejected": -689.4329833984375, "eval_loss": 0.5042153596878052, "eval_rewards/accuracies": 0.737500011920929, "eval_rewards/chosen": -3.0608439445495605, "eval_rewards/margins": 1.3876967430114746, "eval_rewards/rejected": -4.448540687561035, "eval_runtime": 1592.3251, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 4300 }, { "epoch": 0.56, "grad_norm": 19.5, "learning_rate": 2.3784180453824414e-06, "logits/chosen": 1.0778725147247314, "logits/rejected": 2.368303060531616, "logps/chosen": -607.5311889648438, "logps/rejected": -722.4293823242188, "loss": 0.4607, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.2945079803466797, "rewards/margins": 1.4340566396713257, "rewards/rejected": -4.728564739227295, "step": 4310 }, { "epoch": 0.57, "grad_norm": 17.0, "learning_rate": 2.367012216545638e-06, "logits/chosen": 0.9011770486831665, "logits/rejected": 1.6108005046844482, "logps/chosen": -679.2977905273438, "logps/rejected": -808.9659423828125, "loss": 0.4776, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.7852683067321777, "rewards/margins": 1.5607531070709229, "rewards/rejected": -5.34602165222168, "step": 4320 }, { "epoch": 0.57, "grad_norm": 23.875, "learning_rate": 2.3556091630264294e-06, "logits/chosen": 1.6703239679336548, "logits/rejected": 2.2463738918304443, "logps/chosen": -645.5777587890625, "logps/rejected": -776.7327270507812, "loss": 0.4236, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.643219470977783, "rewards/margins": 1.4833171367645264, "rewards/rejected": -5.1265363693237305, "step": 4330 }, { "epoch": 0.57, "grad_norm": 21.375, "learning_rate": 2.344209122794757e-06, "logits/chosen": 1.0503277778625488, "logits/rejected": 2.3379592895507812, "logps/chosen": -627.8720703125, "logps/rejected": -737.0346069335938, "loss": 0.485, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.7017154693603516, "rewards/margins": 1.5395132303237915, "rewards/rejected": -5.241229057312012, "step": 4340 }, { "epoch": 0.57, "grad_norm": 14.3125, "learning_rate": 2.3328123337576787e-06, "logits/chosen": 1.0765559673309326, "logits/rejected": 1.8498785495758057, "logps/chosen": -631.2354125976562, "logps/rejected": -735.3282470703125, "loss": 0.5706, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.5931332111358643, "rewards/margins": 1.1502000093460083, "rewards/rejected": -4.743332862854004, "step": 4350 }, { "epoch": 0.57, "grad_norm": 15.125, "learning_rate": 2.3214190337544017e-06, "logits/chosen": 0.7766814827919006, "logits/rejected": 1.5930789709091187, "logps/chosen": -567.65576171875, "logps/rejected": -674.5923461914062, "loss": 0.5605, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.2872467041015625, "rewards/margins": 1.0767381191253662, "rewards/rejected": -4.363985061645508, "step": 4360 }, { "epoch": 0.57, "grad_norm": 14.1875, "learning_rate": 2.310029460551323e-06, "logits/chosen": 0.7130604982376099, "logits/rejected": 1.5312939882278442, "logps/chosen": -561.925537109375, "logps/rejected": -691.440673828125, "loss": 0.4895, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.86440372467041, "rewards/margins": 1.4014947414398193, "rewards/rejected": -4.265898704528809, "step": 4370 }, { "epoch": 0.57, "grad_norm": 11.625, "learning_rate": 2.2986438518370645e-06, "logits/chosen": 0.2056771069765091, "logits/rejected": 1.5509610176086426, "logps/chosen": -534.5150146484375, "logps/rejected": -645.1749267578125, "loss": 0.3948, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.609005928039551, "rewards/margins": 1.5315120220184326, "rewards/rejected": -4.140517711639404, "step": 4380 }, { "epoch": 0.57, "grad_norm": 10.1875, "learning_rate": 2.2872624452175123e-06, "logits/chosen": 0.7365008592605591, "logits/rejected": 1.443713903427124, "logps/chosen": -478.4400329589844, "logps/rejected": -601.012451171875, "loss": 0.4489, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.5473170280456543, "rewards/margins": 1.3967925310134888, "rewards/rejected": -3.9441094398498535, "step": 4390 }, { "epoch": 0.58, "grad_norm": 12.125, "learning_rate": 2.2758854782108584e-06, "logits/chosen": 0.8305877447128296, "logits/rejected": 1.501903772354126, "logps/chosen": -498.0873107910156, "logps/rejected": -665.9342041015625, "loss": 0.3693, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.7014124393463135, "rewards/margins": 1.7031335830688477, "rewards/rejected": -4.404545783996582, "step": 4400 }, { "epoch": 0.58, "eval_logits/chosen": 1.8050862550735474, "eval_logits/rejected": 2.517800807952881, "eval_logps/chosen": -591.6002197265625, "eval_logps/rejected": -700.556396484375, "eval_loss": 0.4968615472316742, "eval_rewards/accuracies": 0.746999979019165, "eval_rewards/chosen": -3.2697925567626953, "eval_rewards/margins": 1.2899818420410156, "eval_rewards/rejected": -4.559774875640869, "eval_runtime": 1591.5803, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 4400 }, { "epoch": 0.58, "grad_norm": 15.5, "learning_rate": 2.2645131882426458e-06, "logits/chosen": 0.9317380785942078, "logits/rejected": 1.8292909860610962, "logps/chosen": -587.1124267578125, "logps/rejected": -757.9627685546875, "loss": 0.4073, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.214625835418701, "rewards/margins": 1.690768837928772, "rewards/rejected": -4.905394554138184, "step": 4410 }, { "epoch": 0.58, "grad_norm": 9.9375, "learning_rate": 2.2531458126408154e-06, "logits/chosen": 1.728948950767517, "logits/rejected": 1.7796413898468018, "logps/chosen": -652.2147216796875, "logps/rejected": -797.2843627929688, "loss": 0.5817, "rewards/accuracies": 0.75, "rewards/chosen": -3.8443686962127686, "rewards/margins": 1.2194750308990479, "rewards/rejected": -5.063844203948975, "step": 4420 }, { "epoch": 0.58, "grad_norm": 20.375, "learning_rate": 2.2417835886307452e-06, "logits/chosen": 1.2968741655349731, "logits/rejected": 1.9664971828460693, "logps/chosen": -571.7467041015625, "logps/rejected": -706.6888427734375, "loss": 0.4778, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.410456895828247, "rewards/margins": 1.3251036405563354, "rewards/rejected": -4.735560417175293, "step": 4430 }, { "epoch": 0.58, "grad_norm": 11.0625, "learning_rate": 2.2304267533303075e-06, "logits/chosen": 1.0025638341903687, "logits/rejected": 2.462754011154175, "logps/chosen": -603.9998168945312, "logps/rejected": -628.0054321289062, "loss": 0.517, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.4204928874969482, "rewards/margins": 1.0212152004241943, "rewards/rejected": -4.441708087921143, "step": 4440 }, { "epoch": 0.58, "grad_norm": 15.8125, "learning_rate": 2.219075543744918e-06, "logits/chosen": 1.2630012035369873, "logits/rejected": 1.9615631103515625, "logps/chosen": -565.0428466796875, "logps/rejected": -648.239013671875, "loss": 0.6442, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.4224343299865723, "rewards/margins": 0.932254433631897, "rewards/rejected": -4.354689121246338, "step": 4450 }, { "epoch": 0.58, "grad_norm": 23.75, "learning_rate": 2.207730196762589e-06, "logits/chosen": 0.7253775596618652, "logits/rejected": 0.9075161814689636, "logps/chosen": -567.9874267578125, "logps/rejected": -690.0647583007812, "loss": 0.4821, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.9466922283172607, "rewards/margins": 1.2171391248703003, "rewards/rejected": -4.163830757141113, "step": 4460 }, { "epoch": 0.58, "grad_norm": 17.375, "learning_rate": 2.1963909491489846e-06, "logits/chosen": 0.6194896697998047, "logits/rejected": 1.25758957862854, "logps/chosen": -557.9462890625, "logps/rejected": -642.5169677734375, "loss": 0.4892, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.9200026988983154, "rewards/margins": 1.0755693912506104, "rewards/rejected": -3.9955718517303467, "step": 4470 }, { "epoch": 0.59, "grad_norm": 12.1875, "learning_rate": 2.185058037542486e-06, "logits/chosen": 0.48488321900367737, "logits/rejected": 0.8963411450386047, "logps/chosen": -525.1871948242188, "logps/rejected": -625.0621948242188, "loss": 0.5098, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.7714436054229736, "rewards/margins": 1.1916885375976562, "rewards/rejected": -3.963132381439209, "step": 4480 }, { "epoch": 0.59, "grad_norm": 17.25, "learning_rate": 2.173731698449244e-06, "logits/chosen": 0.5312266945838928, "logits/rejected": 1.2149841785430908, "logps/chosen": -601.7564697265625, "logps/rejected": -717.5197143554688, "loss": 0.4883, "rewards/accuracies": 0.75, "rewards/chosen": -3.1194474697113037, "rewards/margins": 1.1115381717681885, "rewards/rejected": -4.230985641479492, "step": 4490 }, { "epoch": 0.59, "grad_norm": 14.875, "learning_rate": 2.1624121682382495e-06, "logits/chosen": 0.9723777770996094, "logits/rejected": 1.461883783340454, "logps/chosen": -538.8240966796875, "logps/rejected": -631.4890747070312, "loss": 0.481, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7736334800720215, "rewards/margins": 1.180018663406372, "rewards/rejected": -3.9536521434783936, "step": 4500 }, { "epoch": 0.59, "eval_logits/chosen": 1.3648182153701782, "eval_logits/rejected": 2.0328919887542725, "eval_logps/chosen": -545.3853149414062, "eval_logps/rejected": -640.7147827148438, "eval_loss": 0.48925405740737915, "eval_rewards/accuracies": 0.7444999814033508, "eval_rewards/chosen": -2.807643175125122, "eval_rewards/margins": 1.1537160873413086, "eval_rewards/rejected": -3.9613587856292725, "eval_runtime": 1591.7001, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 4500 }, { "epoch": 0.59, "grad_norm": 12.375, "learning_rate": 2.1510996831363993e-06, "logits/chosen": 0.13078482449054718, "logits/rejected": 1.3275985717773438, "logps/chosen": -543.0574340820312, "logps/rejected": -632.9281616210938, "loss": 0.4247, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.831533908843994, "rewards/margins": 1.3064507246017456, "rewards/rejected": -4.137984275817871, "step": 4510 }, { "epoch": 0.59, "grad_norm": 9.375, "learning_rate": 2.139794479223565e-06, "logits/chosen": 0.3153165578842163, "logits/rejected": 1.1119401454925537, "logps/chosen": -571.6793823242188, "logps/rejected": -646.7637939453125, "loss": 0.4975, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.9295685291290283, "rewards/margins": 1.1337769031524658, "rewards/rejected": -4.063345432281494, "step": 4520 }, { "epoch": 0.59, "grad_norm": 15.25, "learning_rate": 2.128496792427669e-06, "logits/chosen": 0.5227512121200562, "logits/rejected": 1.2176361083984375, "logps/chosen": -530.2662353515625, "logps/rejected": -652.4630126953125, "loss": 0.4378, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.744332790374756, "rewards/margins": 1.3382190465927124, "rewards/rejected": -4.082551956176758, "step": 4530 }, { "epoch": 0.59, "grad_norm": 19.875, "learning_rate": 2.117206858519758e-06, "logits/chosen": 1.0087593793869019, "logits/rejected": 1.6051127910614014, "logps/chosen": -527.2972412109375, "logps/rejected": -705.387451171875, "loss": 0.399, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.8391306400299072, "rewards/margins": 1.7159910202026367, "rewards/rejected": -4.555121421813965, "step": 4540 }, { "epoch": 0.6, "grad_norm": 21.25, "learning_rate": 2.1059249131090844e-06, "logits/chosen": 0.5082886219024658, "logits/rejected": 1.4257456064224243, "logps/chosen": -575.4322509765625, "logps/rejected": -634.91845703125, "loss": 0.5462, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.027864456176758, "rewards/margins": 1.0270512104034424, "rewards/rejected": -4.054915428161621, "step": 4550 }, { "epoch": 0.6, "grad_norm": 21.5, "learning_rate": 2.094651191638189e-06, "logits/chosen": 0.4393271803855896, "logits/rejected": 1.1420228481292725, "logps/chosen": -534.1982421875, "logps/rejected": -623.5012817382812, "loss": 0.522, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.76639986038208, "rewards/margins": 1.1037254333496094, "rewards/rejected": -3.8701255321502686, "step": 4560 }, { "epoch": 0.6, "grad_norm": 38.25, "learning_rate": 2.0833859293779867e-06, "logits/chosen": 1.1747925281524658, "logits/rejected": 1.879563570022583, "logps/chosen": -539.6349487304688, "logps/rejected": -643.480712890625, "loss": 0.5247, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.930851697921753, "rewards/margins": 1.1933619976043701, "rewards/rejected": -4.124213218688965, "step": 4570 }, { "epoch": 0.6, "grad_norm": 32.0, "learning_rate": 2.0721293614228568e-06, "logits/chosen": 1.0092575550079346, "logits/rejected": 1.8891198635101318, "logps/chosen": -550.1498413085938, "logps/rejected": -662.275634765625, "loss": 0.5383, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.221567153930664, "rewards/margins": 1.1925783157348633, "rewards/rejected": -4.414145469665527, "step": 4580 }, { "epoch": 0.6, "grad_norm": 17.25, "learning_rate": 2.060881722685742e-06, "logits/chosen": 1.0075985193252563, "logits/rejected": 1.9096540212631226, "logps/chosen": -592.787353515625, "logps/rejected": -713.7469482421875, "loss": 0.4931, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3248486518859863, "rewards/margins": 1.3827643394470215, "rewards/rejected": -4.70761251449585, "step": 4590 }, { "epoch": 0.6, "grad_norm": 12.9375, "learning_rate": 2.049643247893235e-06, "logits/chosen": 1.0936925411224365, "logits/rejected": 1.6271178722381592, "logps/chosen": -523.0247802734375, "logps/rejected": -647.8020629882812, "loss": 0.4696, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.0114166736602783, "rewards/margins": 1.296616792678833, "rewards/rejected": -4.308033466339111, "step": 4600 }, { "epoch": 0.6, "eval_logits/chosen": 1.940106987953186, "eval_logits/rejected": 2.6733126640319824, "eval_logps/chosen": -598.3125, "eval_logps/rejected": -704.406494140625, "eval_loss": 0.4945114850997925, "eval_rewards/accuracies": 0.7465000152587891, "eval_rewards/chosen": -3.3369147777557373, "eval_rewards/margins": 1.261361002922058, "eval_rewards/rejected": -4.598275661468506, "eval_runtime": 1591.7121, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 4600 }, { "epoch": 0.6, "grad_norm": 11.25, "learning_rate": 2.0384141715806903e-06, "logits/chosen": 0.8319090008735657, "logits/rejected": 1.6114375591278076, "logps/chosen": -647.0298461914062, "logps/rejected": -705.9091186523438, "loss": 0.5693, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.422800064086914, "rewards/margins": 0.8977988958358765, "rewards/rejected": -4.320598602294922, "step": 4610 }, { "epoch": 0.6, "grad_norm": 14.1875, "learning_rate": 2.0271947280873255e-06, "logits/chosen": 0.5834625959396362, "logits/rejected": 1.2262613773345947, "logps/chosen": -615.0538330078125, "logps/rejected": -690.6492919921875, "loss": 0.4892, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.2901012897491455, "rewards/margins": 0.9858062863349915, "rewards/rejected": -4.275907039642334, "step": 4620 }, { "epoch": 0.61, "grad_norm": 9.625, "learning_rate": 2.0159851515513302e-06, "logits/chosen": 0.6103604435920715, "logits/rejected": 1.439798355102539, "logps/chosen": -625.5350341796875, "logps/rejected": -733.4219360351562, "loss": 0.4369, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8628792762756348, "rewards/margins": 1.389877200126648, "rewards/rejected": -4.252756595611572, "step": 4630 }, { "epoch": 0.61, "grad_norm": 14.125, "learning_rate": 2.004785675904982e-06, "logits/chosen": 1.1228773593902588, "logits/rejected": 1.6704814434051514, "logps/chosen": -550.8262939453125, "logps/rejected": -630.0404052734375, "loss": 0.5813, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.9327898025512695, "rewards/margins": 0.9519754648208618, "rewards/rejected": -3.884765148162842, "step": 4640 }, { "epoch": 0.61, "grad_norm": 8.25, "learning_rate": 1.9935965348697624e-06, "logits/chosen": 0.7240558862686157, "logits/rejected": 0.9415324330329895, "logps/chosen": -542.6046142578125, "logps/rejected": -712.421142578125, "loss": 0.3923, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.919755220413208, "rewards/margins": 1.4323351383209229, "rewards/rejected": -4.352089881896973, "step": 4650 }, { "epoch": 0.61, "grad_norm": 28.0, "learning_rate": 1.9824179619514807e-06, "logits/chosen": 0.8619499206542969, "logits/rejected": 1.7239103317260742, "logps/chosen": -523.3920288085938, "logps/rejected": -648.755126953125, "loss": 0.4401, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.8888051509857178, "rewards/margins": 1.20088791847229, "rewards/rejected": -4.089693069458008, "step": 4660 }, { "epoch": 0.61, "grad_norm": 17.125, "learning_rate": 1.9712501904354004e-06, "logits/chosen": 1.0945820808410645, "logits/rejected": 1.327915072441101, "logps/chosen": -570.0567016601562, "logps/rejected": -677.2376708984375, "loss": 0.5679, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.9967522621154785, "rewards/margins": 0.9667608141899109, "rewards/rejected": -3.963512420654297, "step": 4670 }, { "epoch": 0.61, "grad_norm": 13.25, "learning_rate": 1.960093453381369e-06, "logits/chosen": 1.1339690685272217, "logits/rejected": 2.105699062347412, "logps/chosen": -542.7904663085938, "logps/rejected": -632.7266845703125, "loss": 0.4646, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0150303840637207, "rewards/margins": 1.2327334880828857, "rewards/rejected": -4.2477641105651855, "step": 4680 }, { "epoch": 0.61, "grad_norm": 20.5, "learning_rate": 1.948947983618962e-06, "logits/chosen": 0.6128058433532715, "logits/rejected": 1.3241350650787354, "logps/chosen": -590.0193481445312, "logps/rejected": -701.9865112304688, "loss": 0.4458, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.8744754791259766, "rewards/margins": 1.3885873556137085, "rewards/rejected": -4.263062953948975, "step": 4690 }, { "epoch": 0.62, "grad_norm": 9.9375, "learning_rate": 1.937814013742611e-06, "logits/chosen": 0.673147976398468, "logits/rejected": 1.3856855630874634, "logps/chosen": -479.01611328125, "logps/rejected": -614.7681884765625, "loss": 0.4437, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.6545872688293457, "rewards/margins": 1.4752554893493652, "rewards/rejected": -4.129843235015869, "step": 4700 }, { "epoch": 0.62, "eval_logits/chosen": 1.2695993185043335, "eval_logits/rejected": 2.05474853515625, "eval_logps/chosen": -545.9228515625, "eval_logps/rejected": -653.1788330078125, "eval_loss": 0.4940186142921448, "eval_rewards/accuracies": 0.7444999814033508, "eval_rewards/chosen": -2.813018321990967, "eval_rewards/margins": 1.2729805707931519, "eval_rewards/rejected": -4.085999488830566, "eval_runtime": 1591.5189, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 4700 }, { "epoch": 0.62, "grad_norm": 15.375, "learning_rate": 1.9266917761067617e-06, "logits/chosen": 0.7281503677368164, "logits/rejected": 1.7401161193847656, "logps/chosen": -542.7756958007812, "logps/rejected": -641.5377807617188, "loss": 0.5409, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.9858946800231934, "rewards/margins": 1.0609354972839355, "rewards/rejected": -4.046829700469971, "step": 4710 }, { "epoch": 0.62, "grad_norm": 13.125, "learning_rate": 1.915581502821017e-06, "logits/chosen": 0.4619746804237366, "logits/rejected": 1.4797730445861816, "logps/chosen": -514.7449340820312, "logps/rejected": -664.6442260742188, "loss": 0.4787, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.9499011039733887, "rewards/margins": 1.4481924772262573, "rewards/rejected": -4.398093223571777, "step": 4720 }, { "epoch": 0.62, "grad_norm": 23.875, "learning_rate": 1.9044834257452997e-06, "logits/chosen": 0.49240007996559143, "logits/rejected": 1.0757876634597778, "logps/chosen": -588.3216552734375, "logps/rejected": -722.0255126953125, "loss": 0.6945, "rewards/accuracies": 0.75, "rewards/chosen": -3.253741502761841, "rewards/margins": 1.194266676902771, "rewards/rejected": -4.448008060455322, "step": 4730 }, { "epoch": 0.62, "grad_norm": 13.0, "learning_rate": 1.893397776485006e-06, "logits/chosen": 0.3050432801246643, "logits/rejected": 1.475989580154419, "logps/chosen": -574.0492553710938, "logps/rejected": -703.4495849609375, "loss": 0.4284, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.9173645973205566, "rewards/margins": 1.7435029745101929, "rewards/rejected": -4.660867691040039, "step": 4740 }, { "epoch": 0.62, "grad_norm": 10.3125, "learning_rate": 1.8823247863861804e-06, "logits/chosen": 0.5984830856323242, "logits/rejected": 1.160872459411621, "logps/chosen": -553.18701171875, "logps/rejected": -664.4539184570312, "loss": 0.5087, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.7396388053894043, "rewards/margins": 0.9977794885635376, "rewards/rejected": -3.7374184131622314, "step": 4750 }, { "epoch": 0.62, "grad_norm": 25.25, "learning_rate": 1.8712646865306822e-06, "logits/chosen": 0.5051913857460022, "logits/rejected": 1.2967312335968018, "logps/chosen": -573.8558959960938, "logps/rejected": -653.6807861328125, "loss": 0.5435, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.8586268424987793, "rewards/margins": 1.1963424682617188, "rewards/rejected": -4.05496883392334, "step": 4760 }, { "epoch": 0.62, "grad_norm": 11.375, "learning_rate": 1.8602177077313631e-06, "logits/chosen": 0.6085312962532043, "logits/rejected": 1.7123191356658936, "logps/chosen": -513.794189453125, "logps/rejected": -625.562255859375, "loss": 0.4743, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.6487550735473633, "rewards/margins": 1.3480517864227295, "rewards/rejected": -3.996807098388672, "step": 4770 }, { "epoch": 0.63, "grad_norm": 10.1875, "learning_rate": 1.8491840805272546e-06, "logits/chosen": 0.7332175374031067, "logits/rejected": 1.728525161743164, "logps/chosen": -543.1729736328125, "logps/rejected": -671.0867919921875, "loss": 0.5058, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8513693809509277, "rewards/margins": 1.2838191986083984, "rewards/rejected": -4.135189056396484, "step": 4780 }, { "epoch": 0.63, "grad_norm": 20.875, "learning_rate": 1.8381640351787516e-06, "logits/chosen": 0.6986968517303467, "logits/rejected": 1.7445147037506104, "logps/chosen": -590.0587768554688, "logps/rejected": -642.3038330078125, "loss": 0.5453, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.819589138031006, "rewards/margins": 0.9962056279182434, "rewards/rejected": -3.8157947063446045, "step": 4790 }, { "epoch": 0.63, "grad_norm": 32.0, "learning_rate": 1.8271578016628122e-06, "logits/chosen": 1.1015398502349854, "logits/rejected": 1.974747657775879, "logps/chosen": -560.9954223632812, "logps/rejected": -675.0828247070312, "loss": 0.4492, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.9392876625061035, "rewards/margins": 1.3561334609985352, "rewards/rejected": -4.2954206466674805, "step": 4800 }, { "epoch": 0.63, "eval_logits/chosen": 1.535466194152832, "eval_logits/rejected": 2.339323043823242, "eval_logps/chosen": -541.89599609375, "eval_logps/rejected": -651.1524047851562, "eval_loss": 0.4963241219520569, "eval_rewards/accuracies": 0.7465000152587891, "eval_rewards/chosen": -2.772749900817871, "eval_rewards/margins": 1.292984962463379, "eval_rewards/rejected": -4.065734386444092, "eval_runtime": 1592.0484, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 4800 }, { "epoch": 0.63, "grad_norm": 16.625, "learning_rate": 1.8161656096681546e-06, "logits/chosen": 0.5773103833198547, "logits/rejected": 1.7860008478164673, "logps/chosen": -491.50469970703125, "logps/rejected": -664.9474487304688, "loss": 0.4972, "rewards/accuracies": 0.75, "rewards/chosen": -2.8516623973846436, "rewards/margins": 1.4962561130523682, "rewards/rejected": -4.347918510437012, "step": 4810 }, { "epoch": 0.63, "grad_norm": 13.375, "learning_rate": 1.8051876885904645e-06, "logits/chosen": 0.4815793037414551, "logits/rejected": 1.3946386575698853, "logps/chosen": -519.1699829101562, "logps/rejected": -677.970703125, "loss": 0.3626, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.5333261489868164, "rewards/margins": 1.677099585533142, "rewards/rejected": -4.21042537689209, "step": 4820 }, { "epoch": 0.63, "grad_norm": 14.375, "learning_rate": 1.7942242675276098e-06, "logits/chosen": 1.3374228477478027, "logits/rejected": 1.18678879737854, "logps/chosen": -529.24560546875, "logps/rejected": -640.7904052734375, "loss": 0.5131, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8369922637939453, "rewards/margins": 1.2228530645370483, "rewards/rejected": -4.059844970703125, "step": 4830 }, { "epoch": 0.63, "grad_norm": 9.9375, "learning_rate": 1.783275575274856e-06, "logits/chosen": 0.8641239404678345, "logits/rejected": 1.1198005676269531, "logps/chosen": -567.7559814453125, "logps/rejected": -662.1914672851562, "loss": 0.5847, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.185612440109253, "rewards/margins": 1.0657835006713867, "rewards/rejected": -4.251395225524902, "step": 4840 }, { "epoch": 0.63, "grad_norm": 20.625, "learning_rate": 1.7723418403200943e-06, "logits/chosen": 0.7207244038581848, "logits/rejected": 1.9345829486846924, "logps/chosen": -624.4559936523438, "logps/rejected": -741.6597290039062, "loss": 0.4056, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3388404846191406, "rewards/margins": 1.618586778640747, "rewards/rejected": -4.95742654800415, "step": 4850 }, { "epoch": 0.64, "grad_norm": 15.8125, "learning_rate": 1.7614232908390748e-06, "logits/chosen": 0.21066589653491974, "logits/rejected": 1.2674895524978638, "logps/chosen": -590.947998046875, "logps/rejected": -709.72998046875, "loss": 0.5172, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2923660278320312, "rewards/margins": 1.3251597881317139, "rewards/rejected": -4.617525577545166, "step": 4860 }, { "epoch": 0.64, "grad_norm": 17.625, "learning_rate": 1.7505201546906398e-06, "logits/chosen": 0.9584872126579285, "logits/rejected": 2.0030558109283447, "logps/chosen": -533.1852416992188, "logps/rejected": -680.3133544921875, "loss": 0.3596, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.0049993991851807, "rewards/margins": 1.8188884258270264, "rewards/rejected": -4.823887825012207, "step": 4870 }, { "epoch": 0.64, "grad_norm": 8.875, "learning_rate": 1.7396326594119717e-06, "logits/chosen": 0.8650108575820923, "logits/rejected": 1.0653468370437622, "logps/chosen": -557.3784790039062, "logps/rejected": -722.6024169921875, "loss": 0.405, "rewards/accuracies": 0.8125, "rewards/chosen": -3.025503396987915, "rewards/margins": 1.6017059087753296, "rewards/rejected": -4.627209663391113, "step": 4880 }, { "epoch": 0.64, "grad_norm": 6.53125, "learning_rate": 1.7287610322138449e-06, "logits/chosen": 0.7085530161857605, "logits/rejected": 1.709802269935608, "logps/chosen": -580.3453979492188, "logps/rejected": -651.3648681640625, "loss": 0.55, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.1891696453094482, "rewards/margins": 1.127560019493103, "rewards/rejected": -4.316729545593262, "step": 4890 }, { "epoch": 0.64, "grad_norm": 26.125, "learning_rate": 1.7179054999758817e-06, "logits/chosen": 0.18700245022773743, "logits/rejected": 1.1381707191467285, "logps/chosen": -652.1790161132812, "logps/rejected": -758.5651245117188, "loss": 0.5163, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.5247459411621094, "rewards/margins": 1.343051552772522, "rewards/rejected": -4.867797374725342, "step": 4900 }, { "epoch": 0.64, "eval_logits/chosen": 1.2216382026672363, "eval_logits/rejected": 2.0201332569122314, "eval_logps/chosen": -599.6018676757812, "eval_logps/rejected": -721.0643310546875, "eval_loss": 0.501664936542511, "eval_rewards/accuracies": 0.7465000152587891, "eval_rewards/chosen": -3.3498075008392334, "eval_rewards/margins": 1.4150458574295044, "eval_rewards/rejected": -4.7648539543151855, "eval_runtime": 1591.3817, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 4900 }, { "epoch": 0.64, "grad_norm": 12.4375, "learning_rate": 1.7070662892418225e-06, "logits/chosen": 0.6338080167770386, "logits/rejected": 2.215214490890503, "logps/chosen": -604.2860717773438, "logps/rejected": -725.52392578125, "loss": 0.4728, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.482276201248169, "rewards/margins": 1.4508180618286133, "rewards/rejected": -4.933094501495361, "step": 4910 }, { "epoch": 0.64, "grad_norm": 23.0, "learning_rate": 1.6962436262147913e-06, "logits/chosen": 0.46580928564071655, "logits/rejected": 1.9383723735809326, "logps/chosen": -662.3033447265625, "logps/rejected": -741.284912109375, "loss": 0.5509, "rewards/accuracies": 0.75, "rewards/chosen": -3.588061571121216, "rewards/margins": 1.350954294204712, "rewards/rejected": -4.939015865325928, "step": 4920 }, { "epoch": 0.65, "grad_norm": 38.75, "learning_rate": 1.6854377367525814e-06, "logits/chosen": 0.31709781289100647, "logits/rejected": 1.1077455282211304, "logps/chosen": -584.2669067382812, "logps/rejected": -666.3541870117188, "loss": 0.5443, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.418966293334961, "rewards/margins": 1.1538652181625366, "rewards/rejected": -4.572831153869629, "step": 4930 }, { "epoch": 0.65, "grad_norm": 9.0625, "learning_rate": 1.6746488463629362e-06, "logits/chosen": 0.04629017040133476, "logits/rejected": 1.4084089994430542, "logps/chosen": -565.3751220703125, "logps/rejected": -666.2435302734375, "loss": 0.4557, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.0415847301483154, "rewards/margins": 1.346897840499878, "rewards/rejected": -4.388482570648193, "step": 4940 }, { "epoch": 0.65, "grad_norm": 13.4375, "learning_rate": 1.6638771801988483e-06, "logits/chosen": 0.6879199147224426, "logits/rejected": 1.345462441444397, "logps/chosen": -615.6664428710938, "logps/rejected": -784.7410278320312, "loss": 0.4795, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.4892544746398926, "rewards/margins": 1.5444860458374023, "rewards/rejected": -5.033740043640137, "step": 4950 }, { "epoch": 0.65, "grad_norm": 24.625, "learning_rate": 1.653122963053857e-06, "logits/chosen": 0.6268502473831177, "logits/rejected": 1.3769195079803467, "logps/chosen": -597.7298583984375, "logps/rejected": -719.7408447265625, "loss": 0.3973, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.3582911491394043, "rewards/margins": 1.5615208148956299, "rewards/rejected": -4.919812202453613, "step": 4960 }, { "epoch": 0.65, "grad_norm": 8.125, "learning_rate": 1.6423864193573606e-06, "logits/chosen": 0.6681777834892273, "logits/rejected": 0.9838783144950867, "logps/chosen": -571.2242431640625, "logps/rejected": -712.8031005859375, "loss": 0.496, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.1494784355163574, "rewards/margins": 1.4304306507110596, "rewards/rejected": -4.579909324645996, "step": 4970 }, { "epoch": 0.65, "grad_norm": 13.375, "learning_rate": 1.6316677731699286e-06, "logits/chosen": 0.48633041977882385, "logits/rejected": 2.1906113624572754, "logps/chosen": -566.6742553710938, "logps/rejected": -676.7261962890625, "loss": 0.6053, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.2935385704040527, "rewards/margins": 1.423719048500061, "rewards/rejected": -4.717257499694824, "step": 4980 }, { "epoch": 0.65, "grad_norm": 26.75, "learning_rate": 1.6209672481786302e-06, "logits/chosen": 0.36567243933677673, "logits/rejected": 1.2046464681625366, "logps/chosen": -582.448486328125, "logps/rejected": -712.8392944335938, "loss": 0.4767, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.0652308464050293, "rewards/margins": 1.549925446510315, "rewards/rejected": -4.615156173706055, "step": 4990 }, { "epoch": 0.65, "grad_norm": 10.375, "learning_rate": 1.6102850676923616e-06, "logits/chosen": 0.5834435820579529, "logits/rejected": 1.3819458484649658, "logps/chosen": -551.6906127929688, "logps/rejected": -691.9273681640625, "loss": 0.488, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.029064655303955, "rewards/margins": 1.308930516242981, "rewards/rejected": -4.3379950523376465, "step": 5000 }, { "epoch": 0.65, "eval_logits/chosen": 1.141782522201538, "eval_logits/rejected": 1.9166334867477417, "eval_logps/chosen": -589.7007446289062, "eval_logps/rejected": -700.8106689453125, "eval_loss": 0.4917171597480774, "eval_rewards/accuracies": 0.7480000257492065, "eval_rewards/chosen": -3.2507972717285156, "eval_rewards/margins": 1.3115205764770508, "eval_rewards/rejected": -4.562317848205566, "eval_runtime": 1591.2907, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 5000 }, { "epoch": 0.66, "grad_norm": 17.0, "learning_rate": 1.5996214546371888e-06, "logits/chosen": 0.2420971840620041, "logits/rejected": 1.3805590867996216, "logps/chosen": -615.687744140625, "logps/rejected": -716.4863891601562, "loss": 0.531, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.3506503105163574, "rewards/margins": 1.3097456693649292, "rewards/rejected": -4.660396099090576, "step": 5010 }, { "epoch": 0.66, "grad_norm": 13.875, "learning_rate": 1.588976631551697e-06, "logits/chosen": 0.24625544250011444, "logits/rejected": 0.989874005317688, "logps/chosen": -597.4532470703125, "logps/rejected": -674.4581298828125, "loss": 0.5112, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.2421135902404785, "rewards/margins": 1.1127220392227173, "rewards/rejected": -4.354835510253906, "step": 5020 }, { "epoch": 0.66, "grad_norm": 10.8125, "learning_rate": 1.5783508205823412e-06, "logits/chosen": 0.15622951090335846, "logits/rejected": 1.6825730800628662, "logps/chosen": -613.8956298828125, "logps/rejected": -752.7511596679688, "loss": 0.4556, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3116347789764404, "rewards/margins": 1.4797947406768799, "rewards/rejected": -4.7914299964904785, "step": 5030 }, { "epoch": 0.66, "grad_norm": 13.1875, "learning_rate": 1.5677442434788143e-06, "logits/chosen": 0.4642793536186218, "logits/rejected": 0.683434247970581, "logps/chosen": -559.9010620117188, "logps/rejected": -681.8499755859375, "loss": 0.4602, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.9780526161193848, "rewards/margins": 1.4143049716949463, "rewards/rejected": -4.392356872558594, "step": 5040 }, { "epoch": 0.66, "grad_norm": 5.4375, "learning_rate": 1.5571571215894181e-06, "logits/chosen": -0.12815335392951965, "logits/rejected": 1.1323789358139038, "logps/chosen": -601.626953125, "logps/rejected": -710.6660766601562, "loss": 0.4372, "rewards/accuracies": 0.75, "rewards/chosen": -3.0496914386749268, "rewards/margins": 1.4026107788085938, "rewards/rejected": -4.452301979064941, "step": 5050 }, { "epoch": 0.66, "grad_norm": 20.625, "learning_rate": 1.5465896758564452e-06, "logits/chosen": 0.6499842405319214, "logits/rejected": 1.215057134628296, "logps/chosen": -544.9089965820312, "logps/rejected": -654.0910034179688, "loss": 0.5112, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.162558078765869, "rewards/margins": 1.325282335281372, "rewards/rejected": -4.487840175628662, "step": 5060 }, { "epoch": 0.66, "grad_norm": 15.5, "learning_rate": 1.5360421268115653e-06, "logits/chosen": 0.1901804506778717, "logits/rejected": 0.6268029808998108, "logps/chosen": -530.2285766601562, "logps/rejected": -594.3751220703125, "loss": 0.4743, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7991414070129395, "rewards/margins": 1.081934928894043, "rewards/rejected": -3.8810763359069824, "step": 5070 }, { "epoch": 0.66, "grad_norm": 9.1875, "learning_rate": 1.5255146945712267e-06, "logits/chosen": 0.4935234487056732, "logits/rejected": 0.8733431100845337, "logps/chosen": -549.5220947265625, "logps/rejected": -681.1325073242188, "loss": 0.4263, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.0330758094787598, "rewards/margins": 1.3714685440063477, "rewards/rejected": -4.404544353485107, "step": 5080 }, { "epoch": 0.67, "grad_norm": 30.75, "learning_rate": 1.5150075988320594e-06, "logits/chosen": 0.6490196585655212, "logits/rejected": 1.8105666637420654, "logps/chosen": -576.24755859375, "logps/rejected": -666.9951171875, "loss": 0.5292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.25989031791687, "rewards/margins": 1.3189462423324585, "rewards/rejected": -4.578836441040039, "step": 5090 }, { "epoch": 0.67, "grad_norm": 6.6875, "learning_rate": 1.5045210588662929e-06, "logits/chosen": 0.13218924403190613, "logits/rejected": 1.4612019062042236, "logps/chosen": -584.4468994140625, "logps/rejected": -697.099609375, "loss": 0.3606, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9089255332946777, "rewards/margins": 1.743561029434204, "rewards/rejected": -4.652486324310303, "step": 5100 }, { "epoch": 0.67, "eval_logits/chosen": 0.7813382744789124, "eval_logits/rejected": 1.5031211376190186, "eval_logps/chosen": -562.187744140625, "eval_logps/rejected": -667.6594848632812, "eval_loss": 0.49050119519233704, "eval_rewards/accuracies": 0.7459999918937683, "eval_rewards/chosen": -2.9756669998168945, "eval_rewards/margins": 1.2551382780075073, "eval_rewards/rejected": -4.23080587387085, "eval_runtime": 1591.2135, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 5100 }, { "epoch": 0.67, "grad_norm": 19.875, "learning_rate": 1.4940552935171781e-06, "logits/chosen": 0.20014624297618866, "logits/rejected": 0.6813384294509888, "logps/chosen": -586.2484130859375, "logps/rejected": -698.6951904296875, "loss": 0.5211, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.9433157444000244, "rewards/margins": 1.3401107788085938, "rewards/rejected": -4.283426761627197, "step": 5110 }, { "epoch": 0.67, "grad_norm": 15.4375, "learning_rate": 1.483610521194419e-06, "logits/chosen": -0.20831915736198425, "logits/rejected": 0.7309791445732117, "logps/chosen": -551.2055053710938, "logps/rejected": -652.7286376953125, "loss": 0.5044, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.8443474769592285, "rewards/margins": 1.2581770420074463, "rewards/rejected": -4.102524757385254, "step": 5120 }, { "epoch": 0.67, "grad_norm": 16.25, "learning_rate": 1.4731869598696226e-06, "logits/chosen": -0.0020657808054238558, "logits/rejected": 0.6582788228988647, "logps/chosen": -581.8043823242188, "logps/rejected": -651.2916870117188, "loss": 0.579, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0622143745422363, "rewards/margins": 1.0450094938278198, "rewards/rejected": -4.107223987579346, "step": 5130 }, { "epoch": 0.67, "grad_norm": 13.4375, "learning_rate": 1.4627848270717387e-06, "logits/chosen": -0.0938434973359108, "logits/rejected": 0.987653911113739, "logps/chosen": -549.9574584960938, "logps/rejected": -625.3163452148438, "loss": 0.5751, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.074310302734375, "rewards/margins": 1.1363435983657837, "rewards/rejected": -4.210653305053711, "step": 5140 }, { "epoch": 0.67, "grad_norm": 8.3125, "learning_rate": 1.4524043398825277e-06, "logits/chosen": -0.10383953154087067, "logits/rejected": 0.6164258718490601, "logps/chosen": -589.60107421875, "logps/rejected": -697.7528076171875, "loss": 0.4383, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.9964165687561035, "rewards/margins": 1.383199691772461, "rewards/rejected": -4.3796162605285645, "step": 5150 }, { "epoch": 0.68, "grad_norm": 23.0, "learning_rate": 1.4420457149320299e-06, "logits/chosen": -0.2617081105709076, "logits/rejected": 0.8577947616577148, "logps/chosen": -570.8038330078125, "logps/rejected": -598.7860717773438, "loss": 0.5498, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.934159278869629, "rewards/margins": 1.0436315536499023, "rewards/rejected": -3.977790355682373, "step": 5160 }, { "epoch": 0.68, "grad_norm": 6.6875, "learning_rate": 1.431709168394042e-06, "logits/chosen": 0.26271653175354004, "logits/rejected": 0.6604413390159607, "logps/chosen": -487.736328125, "logps/rejected": -591.9219360351562, "loss": 0.5466, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.592200517654419, "rewards/margins": 1.1626756191253662, "rewards/rejected": -3.7548763751983643, "step": 5170 }, { "epoch": 0.68, "grad_norm": 18.0, "learning_rate": 1.4213949159816059e-06, "logits/chosen": -0.018306344747543335, "logits/rejected": 1.2069590091705322, "logps/chosen": -524.5364990234375, "logps/rejected": -616.7649536132812, "loss": 0.4177, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.609614372253418, "rewards/margins": 1.407111406326294, "rewards/rejected": -4.016725540161133, "step": 5180 }, { "epoch": 0.68, "grad_norm": 14.375, "learning_rate": 1.4111031729425103e-06, "logits/chosen": 0.18678632378578186, "logits/rejected": 0.5336076617240906, "logps/chosen": -523.8223876953125, "logps/rejected": -633.44384765625, "loss": 0.4914, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.944256067276001, "rewards/margins": 1.1703563928604126, "rewards/rejected": -4.114611625671387, "step": 5190 }, { "epoch": 0.68, "grad_norm": 8.4375, "learning_rate": 1.4008341540547965e-06, "logits/chosen": 0.04942316561937332, "logits/rejected": 0.69202721118927, "logps/chosen": -545.2200927734375, "logps/rejected": -652.6435546875, "loss": 0.58, "rewards/accuracies": 0.6875, "rewards/chosen": -2.91707181930542, "rewards/margins": 1.0721800327301025, "rewards/rejected": -3.9892516136169434, "step": 5200 }, { "epoch": 0.68, "eval_logits/chosen": 0.5849885940551758, "eval_logits/rejected": 1.2839491367340088, "eval_logps/chosen": -552.4491577148438, "eval_logps/rejected": -654.7923583984375, "eval_loss": 0.4896867573261261, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -2.878281593322754, "eval_rewards/margins": 1.2238528728485107, "eval_rewards/rejected": -4.1021342277526855, "eval_runtime": 1591.3233, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 5200 }, { "epoch": 0.68, "grad_norm": 15.4375, "learning_rate": 1.3905880736222737e-06, "logits/chosen": 0.21044036746025085, "logits/rejected": 0.3255331516265869, "logps/chosen": -511.98419189453125, "logps/rejected": -632.8362426757812, "loss": 0.496, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.792902946472168, "rewards/margins": 1.1052320003509521, "rewards/rejected": -3.89813494682312, "step": 5210 }, { "epoch": 0.68, "grad_norm": 18.0, "learning_rate": 1.3803651454700531e-06, "logits/chosen": -0.14034347236156464, "logits/rejected": 0.3313951790332794, "logps/chosen": -500.55133056640625, "logps/rejected": -620.788818359375, "loss": 0.4916, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6808226108551025, "rewards/margins": 1.2449661493301392, "rewards/rejected": -3.9257888793945312, "step": 5220 }, { "epoch": 0.68, "grad_norm": 12.375, "learning_rate": 1.3701655829400773e-06, "logits/chosen": -0.19148483872413635, "logits/rejected": 0.5032289624214172, "logps/chosen": -535.7360229492188, "logps/rejected": -652.48583984375, "loss": 0.5032, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7009353637695312, "rewards/margins": 1.0817861557006836, "rewards/rejected": -3.782721757888794, "step": 5230 }, { "epoch": 0.69, "grad_norm": 14.0, "learning_rate": 1.3599895988866756e-06, "logits/chosen": -0.21712355315685272, "logits/rejected": 0.2153054028749466, "logps/chosen": -520.4691772460938, "logps/rejected": -643.4196166992188, "loss": 0.4404, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.611964702606201, "rewards/margins": 1.2944133281707764, "rewards/rejected": -3.9063782691955566, "step": 5240 }, { "epoch": 0.69, "grad_norm": 14.25, "learning_rate": 1.3498374056721198e-06, "logits/chosen": 0.18565845489501953, "logits/rejected": 0.5457652807235718, "logps/chosen": -583.9906616210938, "logps/rejected": -677.7162475585938, "loss": 0.5224, "rewards/accuracies": 0.75, "rewards/chosen": -3.0645644664764404, "rewards/margins": 1.1610159873962402, "rewards/rejected": -4.225581169128418, "step": 5250 }, { "epoch": 0.69, "grad_norm": 16.625, "learning_rate": 1.3397092151621883e-06, "logits/chosen": -0.12094113975763321, "logits/rejected": 0.7155014872550964, "logps/chosen": -605.60986328125, "logps/rejected": -717.340576171875, "loss": 0.5468, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.979651927947998, "rewards/margins": 1.100801944732666, "rewards/rejected": -4.080453872680664, "step": 5260 }, { "epoch": 0.69, "grad_norm": 15.0625, "learning_rate": 1.3296052387217484e-06, "logits/chosen": 0.2536582350730896, "logits/rejected": 0.5665210485458374, "logps/chosen": -551.7531127929688, "logps/rejected": -638.0416259765625, "loss": 0.5232, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.853527069091797, "rewards/margins": 1.0977308750152588, "rewards/rejected": -3.9512581825256348, "step": 5270 }, { "epoch": 0.69, "grad_norm": 20.5, "learning_rate": 1.3195256872103476e-06, "logits/chosen": 0.19485989212989807, "logits/rejected": 0.4308921694755554, "logps/chosen": -575.5075073242188, "logps/rejected": -699.4771728515625, "loss": 0.4572, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.144331932067871, "rewards/margins": 1.1805914640426636, "rewards/rejected": -4.324923515319824, "step": 5280 }, { "epoch": 0.69, "grad_norm": 8.25, "learning_rate": 1.3094707709778068e-06, "logits/chosen": 0.03303980454802513, "logits/rejected": 1.1130796670913696, "logps/chosen": -588.6350708007812, "logps/rejected": -650.6473388671875, "loss": 0.5473, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.109278440475464, "rewards/margins": 1.1929148435592651, "rewards/rejected": -4.302193641662598, "step": 5290 }, { "epoch": 0.69, "grad_norm": 21.5, "learning_rate": 1.2994406998598364e-06, "logits/chosen": 0.03543071821331978, "logits/rejected": 0.4372057020664215, "logps/chosen": -552.617919921875, "logps/rejected": -666.7288208007812, "loss": 0.5788, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.1095051765441895, "rewards/margins": 1.2071665525436401, "rewards/rejected": -4.316671848297119, "step": 5300 }, { "epoch": 0.69, "eval_logits/chosen": 0.7114199995994568, "eval_logits/rejected": 1.4058908224105835, "eval_logps/chosen": -570.6942749023438, "eval_logps/rejected": -672.7390747070312, "eval_loss": 0.49004074931144714, "eval_rewards/accuracies": 0.7490000128746033, "eval_rewards/chosen": -3.060732126235962, "eval_rewards/margins": 1.2208691835403442, "eval_rewards/rejected": -4.281601428985596, "eval_runtime": 1592.4706, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 5300 }, { "epoch": 0.69, "grad_norm": 7.96875, "learning_rate": 1.2894356831736558e-06, "logits/chosen": 0.30859580636024475, "logits/rejected": 1.3365111351013184, "logps/chosen": -567.5665283203125, "logps/rejected": -685.6886596679688, "loss": 0.5104, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.1270010471343994, "rewards/margins": 1.6088695526123047, "rewards/rejected": -4.735870838165283, "step": 5310 }, { "epoch": 0.7, "grad_norm": 11.125, "learning_rate": 1.2794559297136203e-06, "logits/chosen": 0.17694668471813202, "logits/rejected": 0.6812500953674316, "logps/chosen": -587.8646850585938, "logps/rejected": -652.458251953125, "loss": 0.546, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.1099853515625, "rewards/margins": 1.0107700824737549, "rewards/rejected": -4.120755195617676, "step": 5320 }, { "epoch": 0.7, "grad_norm": 21.0, "learning_rate": 1.2695016477468724e-06, "logits/chosen": 0.2500815987586975, "logits/rejected": 0.7284664511680603, "logps/chosen": -609.7841796875, "logps/rejected": -668.8241577148438, "loss": 0.5396, "rewards/accuracies": 0.6875, "rewards/chosen": -3.290778398513794, "rewards/margins": 0.8813145756721497, "rewards/rejected": -4.172092914581299, "step": 5330 }, { "epoch": 0.7, "grad_norm": 12.25, "learning_rate": 1.2595730450089874e-06, "logits/chosen": 0.20059093832969666, "logits/rejected": 0.6966744065284729, "logps/chosen": -577.68603515625, "logps/rejected": -682.2274169921875, "loss": 0.4409, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.045767068862915, "rewards/margins": 1.3784563541412354, "rewards/rejected": -4.42422342300415, "step": 5340 }, { "epoch": 0.7, "grad_norm": 7.4375, "learning_rate": 1.2496703286996433e-06, "logits/chosen": 0.23528914153575897, "logits/rejected": 0.8474369049072266, "logps/chosen": -563.860107421875, "logps/rejected": -701.6685180664062, "loss": 0.3999, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1497082710266113, "rewards/margins": 1.4197697639465332, "rewards/rejected": -4.5694780349731445, "step": 5350 }, { "epoch": 0.7, "grad_norm": 22.0, "learning_rate": 1.2397937054782961e-06, "logits/chosen": 0.3095719516277313, "logits/rejected": 1.2036718130111694, "logps/chosen": -568.3956298828125, "logps/rejected": -663.9815063476562, "loss": 0.504, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.285660982131958, "rewards/margins": 1.2506592273712158, "rewards/rejected": -4.536319732666016, "step": 5360 }, { "epoch": 0.7, "grad_norm": 12.8125, "learning_rate": 1.2299433814598635e-06, "logits/chosen": 0.8002223968505859, "logits/rejected": 1.2919270992279053, "logps/chosen": -540.2605590820312, "logps/rejected": -667.7479858398438, "loss": 0.54, "rewards/accuracies": 0.75, "rewards/chosen": -3.3001084327697754, "rewards/margins": 1.1389198303222656, "rewards/rejected": -4.439028263092041, "step": 5370 }, { "epoch": 0.7, "grad_norm": 19.0, "learning_rate": 1.2201195622104265e-06, "logits/chosen": 0.4608491063117981, "logits/rejected": 1.0170303583145142, "logps/chosen": -602.1793212890625, "logps/rejected": -678.3692626953125, "loss": 0.4949, "rewards/accuracies": 0.75, "rewards/chosen": -3.355029582977295, "rewards/margins": 1.0179849863052368, "rewards/rejected": -4.3730149269104, "step": 5380 }, { "epoch": 0.71, "grad_norm": 13.6875, "learning_rate": 1.2103224527429417e-06, "logits/chosen": 0.8049987554550171, "logits/rejected": 0.7608388066291809, "logps/chosen": -533.5645751953125, "logps/rejected": -648.8416748046875, "loss": 0.4595, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.1001079082489014, "rewards/margins": 1.1306575536727905, "rewards/rejected": -4.2307658195495605, "step": 5390 }, { "epoch": 0.71, "grad_norm": 14.3125, "learning_rate": 1.2005522575129559e-06, "logits/chosen": 0.47020992636680603, "logits/rejected": 0.8666495084762573, "logps/chosen": -507.0211486816406, "logps/rejected": -639.0309448242188, "loss": 0.4138, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.9997565746307373, "rewards/margins": 1.4226839542388916, "rewards/rejected": -4.422440528869629, "step": 5400 }, { "epoch": 0.71, "eval_logits/chosen": 0.8969926238059998, "eval_logits/rejected": 1.6120911836624146, "eval_logps/chosen": -599.54638671875, "eval_logps/rejected": -706.5120239257812, "eval_loss": 0.4909508526325226, "eval_rewards/accuracies": 0.7515000104904175, "eval_rewards/chosen": -3.3492534160614014, "eval_rewards/margins": 1.270076870918274, "eval_rewards/rejected": -4.619329929351807, "eval_runtime": 1592.4682, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 5400 }, { "epoch": 0.71, "grad_norm": 8.625, "learning_rate": 1.1908091804143469e-06, "logits/chosen": -0.17856049537658691, "logits/rejected": 1.5493866205215454, "logps/chosen": -641.3067626953125, "logps/rejected": -742.3198852539062, "loss": 0.3837, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.4065234661102295, "rewards/margins": 1.6241868734359741, "rewards/rejected": -5.030710697174072, "step": 5410 }, { "epoch": 0.71, "grad_norm": 20.625, "learning_rate": 1.1810934247750649e-06, "logits/chosen": 0.0274839885532856, "logits/rejected": 0.9218126535415649, "logps/chosen": -621.7474365234375, "logps/rejected": -747.618896484375, "loss": 0.5001, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.40272855758667, "rewards/margins": 1.4314203262329102, "rewards/rejected": -4.834149360656738, "step": 5420 }, { "epoch": 0.71, "grad_norm": 34.75, "learning_rate": 1.1714051933528881e-06, "logits/chosen": 0.3180490732192993, "logits/rejected": 0.5688341856002808, "logps/chosen": -591.6409912109375, "logps/rejected": -710.7434692382812, "loss": 0.5916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.221553087234497, "rewards/margins": 1.2600384950637817, "rewards/rejected": -4.48159122467041, "step": 5430 }, { "epoch": 0.71, "grad_norm": 17.75, "learning_rate": 1.161744688331192e-06, "logits/chosen": 0.5358083844184875, "logits/rejected": 1.2289925813674927, "logps/chosen": -574.173828125, "logps/rejected": -725.67431640625, "loss": 0.49, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.5042128562927246, "rewards/margins": 1.3280826807022095, "rewards/rejected": -4.8322954177856445, "step": 5440 }, { "epoch": 0.71, "grad_norm": 38.5, "learning_rate": 1.152112111314733e-06, "logits/chosen": 0.5168737173080444, "logits/rejected": 0.9716085195541382, "logps/chosen": -652.9949951171875, "logps/rejected": -748.0339965820312, "loss": 0.6163, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.820329189300537, "rewards/margins": 0.945980429649353, "rewards/rejected": -4.76630973815918, "step": 5450 }, { "epoch": 0.71, "grad_norm": 10.75, "learning_rate": 1.142507663325439e-06, "logits/chosen": 0.05302376672625542, "logits/rejected": 0.6689623594284058, "logps/chosen": -641.9430541992188, "logps/rejected": -752.3848266601562, "loss": 0.445, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.3499228954315186, "rewards/margins": 1.3526192903518677, "rewards/rejected": -4.702542781829834, "step": 5460 }, { "epoch": 0.72, "grad_norm": 13.9375, "learning_rate": 1.132931544798211e-06, "logits/chosen": 0.10494127124547958, "logits/rejected": 0.8696743249893188, "logps/chosen": -627.115478515625, "logps/rejected": -754.4387817382812, "loss": 0.4932, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.5821831226348877, "rewards/margins": 1.1890878677368164, "rewards/rejected": -4.771270751953125, "step": 5470 }, { "epoch": 0.72, "grad_norm": 21.5, "learning_rate": 1.1233839555767482e-06, "logits/chosen": -0.17560331523418427, "logits/rejected": 1.348668098449707, "logps/chosen": -647.2442626953125, "logps/rejected": -714.8629760742188, "loss": 0.5374, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.377307176589966, "rewards/margins": 1.3940346240997314, "rewards/rejected": -4.7713422775268555, "step": 5480 }, { "epoch": 0.72, "grad_norm": 14.625, "learning_rate": 1.1138650949093668e-06, "logits/chosen": -0.04205578565597534, "logits/rejected": 0.8225839734077454, "logps/chosen": -592.0736083984375, "logps/rejected": -687.6424560546875, "loss": 0.5378, "rewards/accuracies": 0.75, "rewards/chosen": -3.302886486053467, "rewards/margins": 1.1337714195251465, "rewards/rejected": -4.436657905578613, "step": 5490 }, { "epoch": 0.72, "grad_norm": 38.25, "learning_rate": 1.1043751614448543e-06, "logits/chosen": -0.006318402476608753, "logits/rejected": 0.9290812611579895, "logps/chosen": -679.0123901367188, "logps/rejected": -716.4671020507812, "loss": 0.5737, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.7354023456573486, "rewards/margins": 1.0157877206802368, "rewards/rejected": -4.751189708709717, "step": 5500 }, { "epoch": 0.72, "eval_logits/chosen": 0.6955203413963318, "eval_logits/rejected": 1.4061022996902466, "eval_logps/chosen": -583.0511474609375, "eval_logps/rejected": -689.724853515625, "eval_loss": 0.48981621861457825, "eval_rewards/accuracies": 0.7480000257492065, "eval_rewards/chosen": -3.1843008995056152, "eval_rewards/margins": 1.2671582698822021, "eval_rewards/rejected": -4.4514594078063965, "eval_runtime": 1592.2368, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 5500 }, { "epoch": 0.72, "grad_norm": 16.125, "learning_rate": 1.0949143532283107e-06, "logits/chosen": -0.15896812081336975, "logits/rejected": 0.8019599914550781, "logps/chosen": -567.5126953125, "logps/rejected": -690.685302734375, "loss": 0.4008, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.882983446121216, "rewards/margins": 1.5050808191299438, "rewards/rejected": -4.388064384460449, "step": 5510 }, { "epoch": 0.72, "grad_norm": 16.25, "learning_rate": 1.0854828676970275e-06, "logits/chosen": 0.022523891180753708, "logits/rejected": 0.5711814761161804, "logps/chosen": -562.1071166992188, "logps/rejected": -637.1767578125, "loss": 0.5859, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.2052435874938965, "rewards/margins": 0.9110970497131348, "rewards/rejected": -4.116340160369873, "step": 5520 }, { "epoch": 0.72, "grad_norm": 14.0625, "learning_rate": 1.076080901676361e-06, "logits/chosen": -0.07785852253437042, "logits/rejected": 1.264184594154358, "logps/chosen": -583.8575439453125, "logps/rejected": -688.28125, "loss": 0.5176, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.1855790615081787, "rewards/margins": 1.4519331455230713, "rewards/rejected": -4.637512683868408, "step": 5530 }, { "epoch": 0.72, "grad_norm": 13.125, "learning_rate": 1.0667086513756234e-06, "logits/chosen": 0.41335564851760864, "logits/rejected": 1.4227181673049927, "logps/chosen": -571.003173828125, "logps/rejected": -673.028564453125, "loss": 0.4494, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.126842737197876, "rewards/margins": 1.4141932725906372, "rewards/rejected": -4.5410356521606445, "step": 5540 }, { "epoch": 0.73, "grad_norm": 20.875, "learning_rate": 1.0573663123839912e-06, "logits/chosen": 0.6815285682678223, "logits/rejected": 1.2222025394439697, "logps/chosen": -582.1532592773438, "logps/rejected": -683.1765747070312, "loss": 0.4605, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.343864917755127, "rewards/margins": 1.2274417877197266, "rewards/rejected": -4.5713067054748535, "step": 5550 }, { "epoch": 0.73, "grad_norm": 11.25, "learning_rate": 1.0480540796664251e-06, "logits/chosen": 0.10144902765750885, "logits/rejected": 0.8711962699890137, "logps/chosen": -566.8845825195312, "logps/rejected": -637.9187622070312, "loss": 0.5358, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.1263809204101562, "rewards/margins": 0.9405500292778015, "rewards/rejected": -4.066930770874023, "step": 5560 }, { "epoch": 0.73, "grad_norm": 14.8125, "learning_rate": 1.0387721475595978e-06, "logits/chosen": -0.22912903130054474, "logits/rejected": 0.9436883926391602, "logps/chosen": -584.5687255859375, "logps/rejected": -707.6832275390625, "loss": 0.4363, "rewards/accuracies": 0.75, "rewards/chosen": -3.001406192779541, "rewards/margins": 1.3768746852874756, "rewards/rejected": -4.378281116485596, "step": 5570 }, { "epoch": 0.73, "grad_norm": 14.25, "learning_rate": 1.0295207097678378e-06, "logits/chosen": 0.14958836138248444, "logits/rejected": 1.2157787084579468, "logps/chosen": -631.9813842773438, "logps/rejected": -706.6419677734375, "loss": 0.4429, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.2564949989318848, "rewards/margins": 1.3660955429077148, "rewards/rejected": -4.622590065002441, "step": 5580 }, { "epoch": 0.73, "grad_norm": 11.875, "learning_rate": 1.0202999593590924e-06, "logits/chosen": 0.5636407732963562, "logits/rejected": 1.1310765743255615, "logps/chosen": -582.2732543945312, "logps/rejected": -701.7555541992188, "loss": 0.4697, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.436253309249878, "rewards/margins": 1.2652666568756104, "rewards/rejected": -4.7015204429626465, "step": 5590 }, { "epoch": 0.73, "grad_norm": 20.875, "learning_rate": 1.011110088760891e-06, "logits/chosen": 0.11347509920597076, "logits/rejected": 1.787253737449646, "logps/chosen": -631.408203125, "logps/rejected": -739.8702392578125, "loss": 0.4249, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.302558183670044, "rewards/margins": 1.6660197973251343, "rewards/rejected": -4.968577861785889, "step": 5600 }, { "epoch": 0.73, "eval_logits/chosen": 0.9557819366455078, "eval_logits/rejected": 1.7110285758972168, "eval_logps/chosen": -599.0980224609375, "eval_logps/rejected": -712.3563842773438, "eval_loss": 0.4918125569820404, "eval_rewards/accuracies": 0.7490000128746033, "eval_rewards/chosen": -3.3447697162628174, "eval_rewards/margins": 1.3330047130584717, "eval_rewards/rejected": -4.677773952484131, "eval_runtime": 1592.4673, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 5600 }, { "epoch": 0.73, "grad_norm": 7.78125, "learning_rate": 1.0019512897563347e-06, "logits/chosen": 0.25357693433761597, "logits/rejected": 1.0136306285858154, "logps/chosen": -611.2633666992188, "logps/rejected": -688.2657470703125, "loss": 0.6093, "rewards/accuracies": 0.6875, "rewards/chosen": -3.4056782722473145, "rewards/margins": 1.086458444595337, "rewards/rejected": -4.492136001586914, "step": 5610 }, { "epoch": 0.74, "grad_norm": 23.0, "learning_rate": 9.928237534800935e-07, "logits/chosen": 0.1514512449502945, "logits/rejected": 1.30339777469635, "logps/chosen": -624.1841430664062, "logps/rejected": -706.040283203125, "loss": 0.4836, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.171689748764038, "rewards/margins": 1.3224647045135498, "rewards/rejected": -4.494154453277588, "step": 5620 }, { "epoch": 0.74, "grad_norm": 14.6875, "learning_rate": 9.837276704144174e-07, "logits/chosen": 0.6703234314918518, "logits/rejected": 1.3387033939361572, "logps/chosen": -538.6806640625, "logps/rejected": -719.9829711914062, "loss": 0.4899, "rewards/accuracies": 0.75, "rewards/chosen": -3.253155469894409, "rewards/margins": 1.5623925924301147, "rewards/rejected": -4.815548419952393, "step": 5630 }, { "epoch": 0.74, "grad_norm": 19.125, "learning_rate": 9.746632303851569e-07, "logits/chosen": 0.35827261209487915, "logits/rejected": 1.1380624771118164, "logps/chosen": -574.8941650390625, "logps/rejected": -700.0415649414062, "loss": 0.4769, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.280141830444336, "rewards/margins": 1.4382925033569336, "rewards/rejected": -4.7184343338012695, "step": 5640 }, { "epoch": 0.74, "grad_norm": 19.625, "learning_rate": 9.65630622557809e-07, "logits/chosen": 0.5788121223449707, "logits/rejected": 0.6934975385665894, "logps/chosen": -583.9860229492188, "logps/rejected": -676.6206665039062, "loss": 0.5894, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.547583818435669, "rewards/margins": 0.9842156171798706, "rewards/rejected": -4.531800270080566, "step": 5650 }, { "epoch": 0.74, "grad_norm": 16.75, "learning_rate": 9.56630035433561e-07, "logits/chosen": 0.36284205317497253, "logits/rejected": 1.3190720081329346, "logps/chosen": -594.0944213867188, "logps/rejected": -695.3111572265625, "loss": 0.5701, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.2505619525909424, "rewards/margins": 1.1941272020339966, "rewards/rejected": -4.4446892738342285, "step": 5660 }, { "epoch": 0.74, "grad_norm": 13.0, "learning_rate": 9.476616568453659e-07, "logits/chosen": 0.47623515129089355, "logits/rejected": 1.598987340927124, "logps/chosen": -607.935546875, "logps/rejected": -715.621826171875, "loss": 0.5532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.381014347076416, "rewards/margins": 1.5085983276367188, "rewards/rejected": -4.889612674713135, "step": 5670 }, { "epoch": 0.74, "grad_norm": 33.0, "learning_rate": 9.387256739540162e-07, "logits/chosen": 0.35760414600372314, "logits/rejected": 1.0268436670303345, "logps/chosen": -598.7515869140625, "logps/rejected": -728.8211669921875, "loss": 0.5329, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.3422341346740723, "rewards/margins": 1.1946098804473877, "rewards/rejected": -4.536844253540039, "step": 5680 }, { "epoch": 0.74, "grad_norm": 15.6875, "learning_rate": 9.298222732442377e-07, "logits/chosen": 0.9866177439689636, "logits/rejected": 1.629563570022583, "logps/chosen": -525.8026123046875, "logps/rejected": -634.7586669921875, "loss": 0.5319, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.24723482131958, "rewards/margins": 1.033001184463501, "rewards/rejected": -4.28023624420166, "step": 5690 }, { "epoch": 0.75, "grad_norm": 15.8125, "learning_rate": 9.20951640520803e-07, "logits/chosen": 0.5580320954322815, "logits/rejected": 1.2365334033966064, "logps/chosen": -517.0693359375, "logps/rejected": -630.359619140625, "loss": 0.5457, "rewards/accuracies": 0.6875, "rewards/chosen": -3.20147705078125, "rewards/margins": 1.087882161140442, "rewards/rejected": -4.289359092712402, "step": 5700 }, { "epoch": 0.75, "eval_logits/chosen": 0.9922085404396057, "eval_logits/rejected": 1.7372267246246338, "eval_logps/chosen": -592.4562377929688, "eval_logps/rejected": -701.9876708984375, "eval_loss": 0.4897419512271881, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -3.278352737426758, "eval_rewards/margins": 1.295735239982605, "eval_rewards/rejected": -4.574087619781494, "eval_runtime": 1592.5991, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 5700 }, { "epoch": 0.75, "grad_norm": 17.75, "learning_rate": 9.121139609046484e-07, "logits/chosen": 0.1699611395597458, "logits/rejected": 0.9091424942016602, "logps/chosen": -586.0159301757812, "logps/rejected": -743.6968383789062, "loss": 0.5009, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.4288792610168457, "rewards/margins": 1.5915720462799072, "rewards/rejected": -5.020451545715332, "step": 5710 }, { "epoch": 0.75, "grad_norm": 12.375, "learning_rate": 9.033094188290121e-07, "logits/chosen": 0.05250464752316475, "logits/rejected": 0.9846289753913879, "logps/chosen": -579.5111083984375, "logps/rejected": -695.6873168945312, "loss": 0.4425, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.0725390911102295, "rewards/margins": 1.472003698348999, "rewards/rejected": -4.5445427894592285, "step": 5720 }, { "epoch": 0.75, "grad_norm": 19.25, "learning_rate": 8.945381980355889e-07, "logits/chosen": 0.48727503418922424, "logits/rejected": 1.2392418384552002, "logps/chosen": -584.3572998046875, "logps/rejected": -646.3514404296875, "loss": 0.5468, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0813841819763184, "rewards/margins": 1.3014500141143799, "rewards/rejected": -4.382834434509277, "step": 5730 }, { "epoch": 0.75, "grad_norm": 6.75, "learning_rate": 8.858004815706919e-07, "logits/chosen": 0.1299765259027481, "logits/rejected": 1.4009642601013184, "logps/chosen": -597.6690063476562, "logps/rejected": -718.248046875, "loss": 0.4349, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.3065361976623535, "rewards/margins": 1.5114755630493164, "rewards/rejected": -4.81801176071167, "step": 5740 }, { "epoch": 0.75, "grad_norm": 7.53125, "learning_rate": 8.77096451781432e-07, "logits/chosen": 0.08577007800340652, "logits/rejected": 1.0180623531341553, "logps/chosen": -612.34912109375, "logps/rejected": -712.0511474609375, "loss": 0.4807, "rewards/accuracies": 0.75, "rewards/chosen": -3.2083351612091064, "rewards/margins": 1.256860613822937, "rewards/rejected": -4.465196132659912, "step": 5750 }, { "epoch": 0.75, "grad_norm": 11.875, "learning_rate": 8.684262903119165e-07, "logits/chosen": 0.5749713778495789, "logits/rejected": 1.654703140258789, "logps/chosen": -603.7852172851562, "logps/rejected": -712.9544677734375, "loss": 0.3899, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.355541944503784, "rewards/margins": 1.4487149715423584, "rewards/rejected": -4.804256916046143, "step": 5760 }, { "epoch": 0.76, "grad_norm": 13.0625, "learning_rate": 8.597901780994525e-07, "logits/chosen": 0.9888712167739868, "logits/rejected": 1.534705400466919, "logps/chosen": -561.2703857421875, "logps/rejected": -702.0836791992188, "loss": 0.455, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.1287243366241455, "rewards/margins": 1.4464129209518433, "rewards/rejected": -4.575137138366699, "step": 5770 }, { "epoch": 0.76, "grad_norm": 7.375, "learning_rate": 8.511882953707773e-07, "logits/chosen": 0.46083322167396545, "logits/rejected": 1.4080815315246582, "logps/chosen": -536.21533203125, "logps/rejected": -711.8817749023438, "loss": 0.4216, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1101584434509277, "rewards/margins": 1.7463359832763672, "rewards/rejected": -4.856494426727295, "step": 5780 }, { "epoch": 0.76, "grad_norm": 19.0, "learning_rate": 8.426208216382944e-07, "logits/chosen": 0.6455115079879761, "logits/rejected": 1.258669137954712, "logps/chosen": -593.31640625, "logps/rejected": -674.8057861328125, "loss": 0.469, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.150735378265381, "rewards/margins": 1.3739995956420898, "rewards/rejected": -4.524735450744629, "step": 5790 }, { "epoch": 0.76, "grad_norm": 17.25, "learning_rate": 8.340879356963245e-07, "logits/chosen": 0.442685604095459, "logits/rejected": 1.396315336227417, "logps/chosen": -635.2542114257812, "logps/rejected": -683.7772827148438, "loss": 0.5287, "rewards/accuracies": 0.6875, "rewards/chosen": -3.358551025390625, "rewards/margins": 1.11667799949646, "rewards/rejected": -4.475229263305664, "step": 5800 }, { "epoch": 0.76, "eval_logits/chosen": 1.2037140130996704, "eval_logits/rejected": 1.9801844358444214, "eval_logps/chosen": -596.2890014648438, "eval_logps/rejected": -710.5777587890625, "eval_loss": 0.4920007884502411, "eval_rewards/accuracies": 0.7494999766349792, "eval_rewards/chosen": -3.3166792392730713, "eval_rewards/margins": 1.343308448791504, "eval_rewards/rejected": -4.659987926483154, "eval_runtime": 1591.929, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 5800 }, { "epoch": 0.76, "grad_norm": 26.375, "learning_rate": 8.255898156173777e-07, "logits/chosen": 0.31752336025238037, "logits/rejected": 1.4020841121673584, "logps/chosen": -596.1682739257812, "logps/rejected": -698.3563232421875, "loss": 0.585, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.314462184906006, "rewards/margins": 1.2807135581970215, "rewards/rejected": -4.5951762199401855, "step": 5810 }, { "epoch": 0.76, "grad_norm": 12.9375, "learning_rate": 8.171266387484389e-07, "logits/chosen": 0.34116196632385254, "logits/rejected": 1.6084121465682983, "logps/chosen": -616.3291625976562, "logps/rejected": -757.4080810546875, "loss": 0.5309, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.6172256469726562, "rewards/margins": 1.4249513149261475, "rewards/rejected": -5.042177200317383, "step": 5820 }, { "epoch": 0.76, "grad_norm": 9.25, "learning_rate": 8.086985817072604e-07, "logits/chosen": 0.22463031113147736, "logits/rejected": 1.289733648300171, "logps/chosen": -579.5349731445312, "logps/rejected": -716.548095703125, "loss": 0.5499, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.202744960784912, "rewards/margins": 1.1577074527740479, "rewards/rejected": -4.360452175140381, "step": 5830 }, { "epoch": 0.76, "grad_norm": 9.75, "learning_rate": 8.003058203786835e-07, "logits/chosen": 0.5225099325180054, "logits/rejected": 1.9025760889053345, "logps/chosen": -554.4903564453125, "logps/rejected": -697.1652221679688, "loss": 0.3968, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.1905171871185303, "rewards/margins": 1.6949659585952759, "rewards/rejected": -4.885483264923096, "step": 5840 }, { "epoch": 0.77, "grad_norm": 22.5, "learning_rate": 7.91948529910963e-07, "logits/chosen": 0.3707103729248047, "logits/rejected": 1.3207480907440186, "logps/chosen": -581.2924194335938, "logps/rejected": -676.443603515625, "loss": 0.4742, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.2187423706054688, "rewards/margins": 1.3398678302764893, "rewards/rejected": -4.558609962463379, "step": 5850 }, { "epoch": 0.77, "grad_norm": 7.1875, "learning_rate": 7.836268847121126e-07, "logits/chosen": 0.5717436075210571, "logits/rejected": 1.3999663591384888, "logps/chosen": -559.6635131835938, "logps/rejected": -694.9573974609375, "loss": 0.4302, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.0068411827087402, "rewards/margins": 1.5318539142608643, "rewards/rejected": -4.538695335388184, "step": 5860 }, { "epoch": 0.77, "grad_norm": 10.5, "learning_rate": 7.753410584462681e-07, "logits/chosen": 0.04864966869354248, "logits/rejected": 1.1614755392074585, "logps/chosen": -604.5387573242188, "logps/rejected": -691.7279052734375, "loss": 0.4828, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.178351879119873, "rewards/margins": 1.2994771003723145, "rewards/rejected": -4.477828502655029, "step": 5870 }, { "epoch": 0.77, "grad_norm": 9.3125, "learning_rate": 7.670912240300596e-07, "logits/chosen": -0.06198643520474434, "logits/rejected": 1.4561761617660522, "logps/chosen": -610.7146606445312, "logps/rejected": -719.8155517578125, "loss": 0.3941, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.0767502784729004, "rewards/margins": 1.5548076629638672, "rewards/rejected": -4.631557464599609, "step": 5880 }, { "epoch": 0.77, "grad_norm": 11.3125, "learning_rate": 7.588775536290035e-07, "logits/chosen": 0.4458466172218323, "logits/rejected": 1.3329976797103882, "logps/chosen": -572.161865234375, "logps/rejected": -688.1071166992188, "loss": 0.3865, "rewards/accuracies": 0.8125, "rewards/chosen": -3.103865146636963, "rewards/margins": 1.4781702756881714, "rewards/rejected": -4.582036018371582, "step": 5890 }, { "epoch": 0.77, "grad_norm": 8.9375, "learning_rate": 7.507002186539147e-07, "logits/chosen": 0.36371809244155884, "logits/rejected": 0.9052112698554993, "logps/chosen": -596.0297241210938, "logps/rejected": -702.3975830078125, "loss": 0.5286, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.279501438140869, "rewards/margins": 1.1486151218414307, "rewards/rejected": -4.428115367889404, "step": 5900 }, { "epoch": 0.77, "eval_logits/chosen": 1.13614821434021, "eval_logits/rejected": 1.9038183689117432, "eval_logps/chosen": -587.6722412109375, "eval_logps/rejected": -701.1276245117188, "eval_loss": 0.4918730854988098, "eval_rewards/accuracies": 0.7465000152587891, "eval_rewards/chosen": -3.2305116653442383, "eval_rewards/margins": 1.3349748849868774, "eval_rewards/rejected": -4.565486907958984, "eval_runtime": 1591.0126, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 5900 }, { "epoch": 0.77, "grad_norm": 9.5, "learning_rate": 7.425593897573216e-07, "logits/chosen": 0.2484653890132904, "logits/rejected": 1.1581590175628662, "logps/chosen": -580.0093994140625, "logps/rejected": -667.7286376953125, "loss": 0.4072, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.056926727294922, "rewards/margins": 1.5210171937942505, "rewards/rejected": -4.577943325042725, "step": 5910 }, { "epoch": 0.77, "grad_norm": 14.375, "learning_rate": 7.344552368299088e-07, "logits/chosen": 0.7004015445709229, "logits/rejected": 1.145263433456421, "logps/chosen": -525.1219482421875, "logps/rejected": -703.9129638671875, "loss": 0.432, "rewards/accuracies": 0.75, "rewards/chosen": -3.113039493560791, "rewards/margins": 1.5690702199935913, "rewards/rejected": -4.682109832763672, "step": 5920 }, { "epoch": 0.78, "grad_norm": 21.0, "learning_rate": 7.26387928996973e-07, "logits/chosen": -0.07604257762432098, "logits/rejected": 1.174023985862732, "logps/chosen": -568.1380615234375, "logps/rejected": -627.6046142578125, "loss": 0.5232, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.1161561012268066, "rewards/margins": 1.02237868309021, "rewards/rejected": -4.1385345458984375, "step": 5930 }, { "epoch": 0.78, "grad_norm": 20.25, "learning_rate": 7.183576346148899e-07, "logits/chosen": 0.3111092746257782, "logits/rejected": 1.6442598104476929, "logps/chosen": -615.7824096679688, "logps/rejected": -697.6546020507812, "loss": 0.4505, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.264331102371216, "rewards/margins": 1.4083982706069946, "rewards/rejected": -4.672729015350342, "step": 5940 }, { "epoch": 0.78, "grad_norm": 15.625, "learning_rate": 7.103645212676044e-07, "logits/chosen": 0.3181416392326355, "logits/rejected": 1.6206680536270142, "logps/chosen": -572.1453247070312, "logps/rejected": -702.2391967773438, "loss": 0.4264, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0080885887145996, "rewards/margins": 1.6056219339370728, "rewards/rejected": -4.613710403442383, "step": 5950 }, { "epoch": 0.78, "grad_norm": 26.125, "learning_rate": 7.024087557631318e-07, "logits/chosen": 0.088649682700634, "logits/rejected": 1.1377710103988647, "logps/chosen": -626.9284057617188, "logps/rejected": -731.088134765625, "loss": 0.4376, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.1198205947875977, "rewards/margins": 1.4172180891036987, "rewards/rejected": -4.537038326263428, "step": 5960 }, { "epoch": 0.78, "grad_norm": 11.3125, "learning_rate": 6.944905041300739e-07, "logits/chosen": 0.10835651308298111, "logits/rejected": 0.9804534912109375, "logps/chosen": -591.5462646484375, "logps/rejected": -722.29150390625, "loss": 0.4895, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.1032094955444336, "rewards/margins": 1.463866949081421, "rewards/rejected": -4.567076206207275, "step": 5970 }, { "epoch": 0.78, "grad_norm": 11.5, "learning_rate": 6.866099316141606e-07, "logits/chosen": 0.04711395502090454, "logits/rejected": 0.4793902337551117, "logps/chosen": -625.3909912109375, "logps/rejected": -680.4255981445312, "loss": 0.5642, "rewards/accuracies": 0.6875, "rewards/chosen": -3.2621428966522217, "rewards/margins": 1.0643290281295776, "rewards/rejected": -4.32647180557251, "step": 5980 }, { "epoch": 0.78, "grad_norm": 8.0, "learning_rate": 6.787672026747946e-07, "logits/chosen": 0.6254671216011047, "logits/rejected": 2.311389923095703, "logps/chosen": -620.357666015625, "logps/rejected": -669.0045166015625, "loss": 0.5704, "rewards/accuracies": 0.6875, "rewards/chosen": -3.432579517364502, "rewards/margins": 1.1018991470336914, "rewards/rejected": -4.534478664398193, "step": 5990 }, { "epoch": 0.79, "grad_norm": 17.625, "learning_rate": 6.709624809816223e-07, "logits/chosen": 0.38161593675613403, "logits/rejected": 1.1075352430343628, "logps/chosen": -637.8136596679688, "logps/rejected": -724.2991943359375, "loss": 0.5147, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.4568965435028076, "rewards/margins": 1.2378437519073486, "rewards/rejected": -4.694740295410156, "step": 6000 }, { "epoch": 0.79, "eval_logits/chosen": 1.1726433038711548, "eval_logits/rejected": 1.9302729368209839, "eval_logps/chosen": -596.0759887695312, "eval_logps/rejected": -708.931884765625, "eval_loss": 0.49101775884628296, "eval_rewards/accuracies": 0.7505000233650208, "eval_rewards/chosen": -3.314549207687378, "eval_rewards/margins": 1.3289803266525269, "eval_rewards/rejected": -4.643529415130615, "eval_runtime": 1590.6827, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 6000 }, { "epoch": 0.79, "grad_norm": 9.8125, "learning_rate": 6.6319592941112e-07, "logits/chosen": -0.11990109831094742, "logits/rejected": 1.020079493522644, "logps/chosen": -551.75439453125, "logps/rejected": -731.1846923828125, "loss": 0.4526, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.1442012786865234, "rewards/margins": 1.6808693408966064, "rewards/rejected": -4.825070381164551, "step": 6010 }, { "epoch": 0.79, "grad_norm": 18.125, "learning_rate": 6.554677100431927e-07, "logits/chosen": 0.13416549563407898, "logits/rejected": 1.3785282373428345, "logps/chosen": -633.962890625, "logps/rejected": -651.944580078125, "loss": 0.5893, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3228306770324707, "rewards/margins": 0.920928955078125, "rewards/rejected": -4.243759632110596, "step": 6020 }, { "epoch": 0.79, "grad_norm": 21.5, "learning_rate": 6.4777798415779e-07, "logits/chosen": 0.44740551710128784, "logits/rejected": 1.2276852130889893, "logps/chosen": -571.357421875, "logps/rejected": -697.7033081054688, "loss": 0.5064, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.125563144683838, "rewards/margins": 1.19227135181427, "rewards/rejected": -4.317834377288818, "step": 6030 }, { "epoch": 0.79, "grad_norm": 11.875, "learning_rate": 6.401269122315451e-07, "logits/chosen": 0.5647193193435669, "logits/rejected": 0.9143550992012024, "logps/chosen": -606.7120971679688, "logps/rejected": -702.9698486328125, "loss": 0.4996, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.2641494274139404, "rewards/margins": 1.107541799545288, "rewards/rejected": -4.371691703796387, "step": 6040 }, { "epoch": 0.79, "grad_norm": 12.9375, "learning_rate": 6.325146539344196e-07, "logits/chosen": 0.36607789993286133, "logits/rejected": 0.7721070051193237, "logps/chosen": -600.3197631835938, "logps/rejected": -734.3776245117188, "loss": 0.5224, "rewards/accuracies": 0.75, "rewards/chosen": -3.335095167160034, "rewards/margins": 1.2280336618423462, "rewards/rejected": -4.56312894821167, "step": 6050 }, { "epoch": 0.79, "grad_norm": 18.75, "learning_rate": 6.249413681263782e-07, "logits/chosen": 0.6674261093139648, "logits/rejected": 1.2911916971206665, "logps/chosen": -580.8428955078125, "logps/rejected": -668.2259521484375, "loss": 0.4819, "rewards/accuracies": 0.75, "rewards/chosen": -3.283857822418213, "rewards/margins": 1.2953107357025146, "rewards/rejected": -4.57916784286499, "step": 6060 }, { "epoch": 0.79, "grad_norm": 10.8125, "learning_rate": 6.174072128540686e-07, "logits/chosen": 0.3818279802799225, "logits/rejected": 1.0219361782073975, "logps/chosen": -553.1798095703125, "logps/rejected": -647.6849365234375, "loss": 0.4719, "rewards/accuracies": 0.8125, "rewards/chosen": -3.125274419784546, "rewards/margins": 1.3637750148773193, "rewards/rejected": -4.489049434661865, "step": 6070 }, { "epoch": 0.8, "grad_norm": 20.625, "learning_rate": 6.099123453475245e-07, "logits/chosen": -0.03337569907307625, "logits/rejected": 0.600936770439148, "logps/chosen": -599.7701416015625, "logps/rejected": -710.7029418945312, "loss": 0.5543, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.229990005493164, "rewards/margins": 1.0938842296600342, "rewards/rejected": -4.323873996734619, "step": 6080 }, { "epoch": 0.8, "grad_norm": 14.1875, "learning_rate": 6.024569220168836e-07, "logits/chosen": 0.9164674878120422, "logits/rejected": 1.220346450805664, "logps/chosen": -546.9329833984375, "logps/rejected": -662.7405395507812, "loss": 0.5744, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.2453300952911377, "rewards/margins": 1.0616782903671265, "rewards/rejected": -4.307008266448975, "step": 6090 }, { "epoch": 0.8, "grad_norm": 13.875, "learning_rate": 5.950410984491268e-07, "logits/chosen": 0.7717280387878418, "logits/rejected": 1.1919779777526855, "logps/chosen": -580.9427490234375, "logps/rejected": -707.2449951171875, "loss": 0.4478, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.240947723388672, "rewards/margins": 1.3304286003112793, "rewards/rejected": -4.571376323699951, "step": 6100 }, { "epoch": 0.8, "eval_logits/chosen": 1.0185655355453491, "eval_logits/rejected": 1.7621047496795654, "eval_logps/chosen": -585.310546875, "eval_logps/rejected": -694.713134765625, "eval_loss": 0.4886416494846344, "eval_rewards/accuracies": 0.7480000257492065, "eval_rewards/chosen": -3.206895589828491, "eval_rewards/margins": 1.2944468259811401, "eval_rewards/rejected": -4.501342296600342, "eval_runtime": 1591.1979, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 6100 }, { "epoch": 0.8, "grad_norm": 7.75, "learning_rate": 5.876650294048262e-07, "logits/chosen": -0.022813748568296432, "logits/rejected": 0.9088582992553711, "logps/chosen": -578.6947021484375, "logps/rejected": -691.7832641601562, "loss": 0.4224, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.993137836456299, "rewards/margins": 1.3222205638885498, "rewards/rejected": -4.315358638763428, "step": 6110 }, { "epoch": 0.8, "grad_norm": 12.6875, "learning_rate": 5.8032886881492e-07, "logits/chosen": 0.0022028745152056217, "logits/rejected": 1.4448870420455933, "logps/chosen": -580.8271484375, "logps/rejected": -657.9542236328125, "loss": 0.4356, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.1111884117126465, "rewards/margins": 1.3057067394256592, "rewards/rejected": -4.416895389556885, "step": 6120 }, { "epoch": 0.8, "grad_norm": 5.96875, "learning_rate": 5.730327697774988e-07, "logits/chosen": 0.32467782497406006, "logits/rejected": 0.7888936996459961, "logps/chosen": -599.9251708984375, "logps/rejected": -760.7127685546875, "loss": 0.434, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.1381869316101074, "rewards/margins": 1.4439395666122437, "rewards/rejected": -4.582126617431641, "step": 6130 }, { "epoch": 0.8, "grad_norm": 13.625, "learning_rate": 5.657768845546068e-07, "logits/chosen": 0.8671263456344604, "logits/rejected": 1.679828405380249, "logps/chosen": -563.6256103515625, "logps/rejected": -678.2106323242188, "loss": 0.5529, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.3380939960479736, "rewards/margins": 1.1758853197097778, "rewards/rejected": -4.513979434967041, "step": 6140 }, { "epoch": 0.8, "grad_norm": 15.125, "learning_rate": 5.585613645690713e-07, "logits/chosen": -0.0506153330206871, "logits/rejected": 0.853299617767334, "logps/chosen": -596.2897338867188, "logps/rejected": -728.8367309570312, "loss": 0.447, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.2465102672576904, "rewards/margins": 1.3228228092193604, "rewards/rejected": -4.569333076477051, "step": 6150 }, { "epoch": 0.81, "grad_norm": 10.625, "learning_rate": 5.513863604013355e-07, "logits/chosen": 0.5937899351119995, "logits/rejected": 1.3581970930099487, "logps/chosen": -574.1300659179688, "logps/rejected": -655.2301025390625, "loss": 0.4973, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.2126641273498535, "rewards/margins": 1.2130138874053955, "rewards/rejected": -4.42567777633667, "step": 6160 }, { "epoch": 0.81, "grad_norm": 15.9375, "learning_rate": 5.442520217863215e-07, "logits/chosen": 0.37454861402511597, "logits/rejected": 1.0216349363327026, "logps/chosen": -633.1695556640625, "logps/rejected": -760.712890625, "loss": 0.4117, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1701767444610596, "rewards/margins": 1.5251020193099976, "rewards/rejected": -4.695279121398926, "step": 6170 }, { "epoch": 0.81, "grad_norm": 37.5, "learning_rate": 5.371584976103034e-07, "logits/chosen": 0.19896648824214935, "logits/rejected": 0.8611049652099609, "logps/chosen": -606.3096313476562, "logps/rejected": -702.2210083007812, "loss": 0.499, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.2889304161071777, "rewards/margins": 1.151632308959961, "rewards/rejected": -4.440562725067139, "step": 6180 }, { "epoch": 0.81, "grad_norm": 9.8125, "learning_rate": 5.301059359077987e-07, "logits/chosen": 0.5175013542175293, "logits/rejected": 1.247930645942688, "logps/chosen": -570.9851684570312, "logps/rejected": -701.9575805664062, "loss": 0.4799, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3489022254943848, "rewards/margins": 1.3883564472198486, "rewards/rejected": -4.737257957458496, "step": 6190 }, { "epoch": 0.81, "grad_norm": 23.5, "learning_rate": 5.230944838584806e-07, "logits/chosen": 0.49456173181533813, "logits/rejected": 0.8041893243789673, "logps/chosen": -576.3448486328125, "logps/rejected": -717.2078247070312, "loss": 0.5236, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.191760540008545, "rewards/margins": 1.3665167093276978, "rewards/rejected": -4.558276653289795, "step": 6200 }, { "epoch": 0.81, "eval_logits/chosen": 1.0793569087982178, "eval_logits/rejected": 1.8309375047683716, "eval_logps/chosen": -596.6957397460938, "eval_logps/rejected": -709.5498657226562, "eval_loss": 0.4901345372200012, "eval_rewards/accuracies": 0.7494999766349792, "eval_rewards/chosen": -3.320746898651123, "eval_rewards/margins": 1.3289613723754883, "eval_rewards/rejected": -4.6497087478637695, "eval_runtime": 1591.779, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 6200 }, { "epoch": 0.81, "grad_norm": 12.5, "learning_rate": 5.161242877841083e-07, "logits/chosen": 0.840091347694397, "logits/rejected": 1.9746043682098389, "logps/chosen": -618.3326416015625, "logps/rejected": -682.1724853515625, "loss": 0.5775, "rewards/accuracies": 0.6875, "rewards/chosen": -3.6712398529052734, "rewards/margins": 1.017067790031433, "rewards/rejected": -4.688307762145996, "step": 6210 }, { "epoch": 0.81, "grad_norm": 27.125, "learning_rate": 5.091954931454682e-07, "logits/chosen": 0.28132981061935425, "logits/rejected": 1.3453963994979858, "logps/chosen": -594.2634887695312, "logps/rejected": -707.8575439453125, "loss": 0.529, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.3353753089904785, "rewards/margins": 1.407685399055481, "rewards/rejected": -4.743060111999512, "step": 6220 }, { "epoch": 0.82, "grad_norm": 6.59375, "learning_rate": 5.023082445393446e-07, "logits/chosen": 0.30338960886001587, "logits/rejected": 0.6292451024055481, "logps/chosen": -613.8154296875, "logps/rejected": -710.1831665039062, "loss": 0.476, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.403163433074951, "rewards/margins": 1.109538197517395, "rewards/rejected": -4.512701988220215, "step": 6230 }, { "epoch": 0.82, "grad_norm": 15.3125, "learning_rate": 4.95462685695498e-07, "logits/chosen": 0.29414045810699463, "logits/rejected": 0.9581305384635925, "logps/chosen": -574.3023681640625, "logps/rejected": -708.0538940429688, "loss": 0.4095, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.2103676795959473, "rewards/margins": 1.4833682775497437, "rewards/rejected": -4.6937360763549805, "step": 6240 }, { "epoch": 0.82, "grad_norm": 6.4375, "learning_rate": 4.88658959473666e-07, "logits/chosen": -0.03567713499069214, "logits/rejected": 0.8967617154121399, "logps/chosen": -583.6680908203125, "logps/rejected": -677.9716186523438, "loss": 0.4951, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.315932035446167, "rewards/margins": 1.201798439025879, "rewards/rejected": -4.517730236053467, "step": 6250 }, { "epoch": 0.82, "grad_norm": 11.0625, "learning_rate": 4.818972078605821e-07, "logits/chosen": 0.3524642586708069, "logits/rejected": 1.6111385822296143, "logps/chosen": -596.5489501953125, "logps/rejected": -727.8389892578125, "loss": 0.4802, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.2479538917541504, "rewards/margins": 1.6509485244750977, "rewards/rejected": -4.898902416229248, "step": 6260 }, { "epoch": 0.82, "grad_norm": 8.4375, "learning_rate": 4.7517757196701514e-07, "logits/chosen": 0.23523840308189392, "logits/rejected": 1.2825753688812256, "logps/chosen": -628.8624267578125, "logps/rejected": -738.5863037109375, "loss": 0.4968, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.5499215126037598, "rewards/margins": 1.364435076713562, "rewards/rejected": -4.914356708526611, "step": 6270 }, { "epoch": 0.82, "grad_norm": 15.0625, "learning_rate": 4.6850019202482193e-07, "logits/chosen": 0.2345297634601593, "logits/rejected": 1.5206564664840698, "logps/chosen": -624.873046875, "logps/rejected": -731.8216552734375, "loss": 0.4436, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.4002845287323, "rewards/margins": 1.4457454681396484, "rewards/rejected": -4.846030235290527, "step": 6280 }, { "epoch": 0.82, "grad_norm": 7.9375, "learning_rate": 4.618652073840188e-07, "logits/chosen": 0.2798255681991577, "logits/rejected": 1.280543327331543, "logps/chosen": -546.3389282226562, "logps/rejected": -665.4715576171875, "loss": 0.476, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.009049654006958, "rewards/margins": 1.4991285800933838, "rewards/rejected": -4.508177757263184, "step": 6290 }, { "epoch": 0.82, "grad_norm": 8.5, "learning_rate": 4.5527275650987965e-07, "logits/chosen": 0.2119041234254837, "logits/rejected": 0.7161015272140503, "logps/chosen": -608.0989990234375, "logps/rejected": -738.2981567382812, "loss": 0.5079, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.027043581008911, "rewards/margins": 1.2831947803497314, "rewards/rejected": -4.310237884521484, "step": 6300 }, { "epoch": 0.82, "eval_logits/chosen": 1.032199501991272, "eval_logits/rejected": 1.7747009992599487, "eval_logps/chosen": -595.4583129882812, "eval_logps/rejected": -706.781982421875, "eval_loss": 0.4889962375164032, "eval_rewards/accuracies": 0.7494999766349792, "eval_rewards/chosen": -3.3083724975585938, "eval_rewards/margins": 1.3136582374572754, "eval_rewards/rejected": -4.622030735015869, "eval_runtime": 1592.0101, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 6300 }, { "epoch": 0.83, "grad_norm": 11.75, "learning_rate": 4.487229769800394e-07, "logits/chosen": 0.3432023823261261, "logits/rejected": 0.8247334361076355, "logps/chosen": -597.5015869140625, "logps/rejected": -651.3562622070312, "loss": 0.5573, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.2687923908233643, "rewards/margins": 0.9014599919319153, "rewards/rejected": -4.170252323150635, "step": 6310 }, { "epoch": 0.83, "grad_norm": 29.0, "learning_rate": 4.422160054816285e-07, "logits/chosen": 0.20175309479236603, "logits/rejected": 0.72217857837677, "logps/chosen": -546.6419067382812, "logps/rejected": -679.1134033203125, "loss": 0.5044, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.018216371536255, "rewards/margins": 1.276477336883545, "rewards/rejected": -4.294693946838379, "step": 6320 }, { "epoch": 0.83, "grad_norm": 17.875, "learning_rate": 4.35751977808416e-07, "logits/chosen": 0.7091022729873657, "logits/rejected": 1.5161218643188477, "logps/chosen": -610.763427734375, "logps/rejected": -721.1217651367188, "loss": 0.4028, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.3477084636688232, "rewards/margins": 1.465746521949768, "rewards/rejected": -4.813455104827881, "step": 6330 }, { "epoch": 0.83, "grad_norm": 13.25, "learning_rate": 4.293310288579794e-07, "logits/chosen": 0.09544781595468521, "logits/rejected": 0.8159352540969849, "logps/chosen": -580.238037109375, "logps/rejected": -642.5474243164062, "loss": 0.5207, "rewards/accuracies": 0.6875, "rewards/chosen": -3.303569793701172, "rewards/margins": 0.9509817361831665, "rewards/rejected": -4.254551887512207, "step": 6340 }, { "epoch": 0.83, "grad_norm": 17.0, "learning_rate": 4.2295329262888733e-07, "logits/chosen": 0.020851727575063705, "logits/rejected": 1.0389108657836914, "logps/chosen": -595.221923828125, "logps/rejected": -701.2500610351562, "loss": 0.5483, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.224815845489502, "rewards/margins": 1.1890679597854614, "rewards/rejected": -4.413883686065674, "step": 6350 }, { "epoch": 0.83, "grad_norm": 54.75, "learning_rate": 4.1661890221790316e-07, "logits/chosen": 0.35152512788772583, "logits/rejected": 1.3631591796875, "logps/chosen": -667.9931030273438, "logps/rejected": -738.0748291015625, "loss": 0.6645, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.6574642658233643, "rewards/margins": 0.9594003558158875, "rewards/rejected": -4.616864204406738, "step": 6360 }, { "epoch": 0.83, "grad_norm": 15.375, "learning_rate": 4.103279898172072e-07, "logits/chosen": 0.49099016189575195, "logits/rejected": 0.9257776141166687, "logps/chosen": -560.7960205078125, "logps/rejected": -641.2603149414062, "loss": 0.5571, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.2241997718811035, "rewards/margins": 1.0578079223632812, "rewards/rejected": -4.282007694244385, "step": 6370 }, { "epoch": 0.83, "grad_norm": 8.5, "learning_rate": 4.040806867116401e-07, "logits/chosen": 0.4509049952030182, "logits/rejected": 1.1655193567276, "logps/chosen": -627.8912353515625, "logps/rejected": -751.0303955078125, "loss": 0.4506, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.246445417404175, "rewards/margins": 1.472038745880127, "rewards/rejected": -4.718484401702881, "step": 6380 }, { "epoch": 0.84, "grad_norm": 17.875, "learning_rate": 3.978771232759615e-07, "logits/chosen": 0.5065037608146667, "logits/rejected": 1.0776305198669434, "logps/chosen": -563.1023559570312, "logps/rejected": -674.8141479492188, "loss": 0.4742, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.238140821456909, "rewards/margins": 1.2237097024917603, "rewards/rejected": -4.461850643157959, "step": 6390 }, { "epoch": 0.84, "grad_norm": 13.875, "learning_rate": 3.917174289721276e-07, "logits/chosen": 0.5604764819145203, "logits/rejected": 0.7512072324752808, "logps/chosen": -558.1353759765625, "logps/rejected": -691.1140747070312, "loss": 0.4942, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.1084206104278564, "rewards/margins": 1.387665033340454, "rewards/rejected": -4.496085166931152, "step": 6400 }, { "epoch": 0.84, "eval_logits/chosen": 1.026822566986084, "eval_logits/rejected": 1.7716290950775146, "eval_logps/chosen": -590.8313598632812, "eval_logps/rejected": -701.301025390625, "eval_loss": 0.48912298679351807, "eval_rewards/accuracies": 0.7494999766349792, "eval_rewards/chosen": -3.26210355758667, "eval_rewards/margins": 1.3051165342330933, "eval_rewards/rejected": -4.5672197341918945, "eval_runtime": 1591.6194, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 6400 }, { "epoch": 0.84, "grad_norm": 25.0, "learning_rate": 3.856017323465938e-07, "logits/chosen": 0.08190663903951645, "logits/rejected": 0.6202396750450134, "logps/chosen": -553.3712768554688, "logps/rejected": -693.1614990234375, "loss": 0.4305, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.957423210144043, "rewards/margins": 1.4459336996078491, "rewards/rejected": -4.403356552124023, "step": 6410 }, { "epoch": 0.84, "grad_norm": 19.125, "learning_rate": 3.7953016102762695e-07, "logits/chosen": 0.3419317603111267, "logits/rejected": 1.0710508823394775, "logps/chosen": -599.0252685546875, "logps/rejected": -694.2352294921875, "loss": 0.5246, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.2364730834960938, "rewards/margins": 1.1976615190505981, "rewards/rejected": -4.434134483337402, "step": 6420 }, { "epoch": 0.84, "grad_norm": 15.1875, "learning_rate": 3.7350284172264493e-07, "logits/chosen": 0.45977815985679626, "logits/rejected": 1.0664544105529785, "logps/chosen": -610.8163452148438, "logps/rejected": -696.8948974609375, "loss": 0.5006, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.2424583435058594, "rewards/margins": 1.174291968345642, "rewards/rejected": -4.416750431060791, "step": 6430 }, { "epoch": 0.84, "grad_norm": 8.875, "learning_rate": 3.67519900215573e-07, "logits/chosen": 0.2526703476905823, "logits/rejected": 1.0846112966537476, "logps/chosen": -596.3863525390625, "logps/rejected": -693.46875, "loss": 0.3881, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.027198314666748, "rewards/margins": 1.4641358852386475, "rewards/rejected": -4.491333961486816, "step": 6440 }, { "epoch": 0.84, "grad_norm": 4.84375, "learning_rate": 3.615814613642174e-07, "logits/chosen": 0.18329405784606934, "logits/rejected": 0.8037030100822449, "logps/chosen": -567.7757568359375, "logps/rejected": -723.3435668945312, "loss": 0.4066, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0279901027679443, "rewards/margins": 1.6228275299072266, "rewards/rejected": -4.65081787109375, "step": 6450 }, { "epoch": 0.85, "grad_norm": 18.625, "learning_rate": 3.5568764909765795e-07, "logits/chosen": 0.4307138919830322, "logits/rejected": 1.433720350265503, "logps/chosen": -633.9922485351562, "logps/rejected": -681.5325317382812, "loss": 0.5682, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.525965929031372, "rewards/margins": 1.0324019193649292, "rewards/rejected": -4.558367729187012, "step": 6460 }, { "epoch": 0.85, "grad_norm": 7.78125, "learning_rate": 3.498385864136672e-07, "logits/chosen": 0.23667888343334198, "logits/rejected": 0.9870807528495789, "logps/chosen": -586.2492065429688, "logps/rejected": -757.7203369140625, "loss": 0.4709, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.298717975616455, "rewards/margins": 1.508500337600708, "rewards/rejected": -4.807218074798584, "step": 6470 }, { "epoch": 0.85, "grad_norm": 19.375, "learning_rate": 3.440343953761363e-07, "logits/chosen": 0.3215915858745575, "logits/rejected": 1.3016798496246338, "logps/chosen": -536.6361694335938, "logps/rejected": -679.41650390625, "loss": 0.3826, "rewards/accuracies": 0.75, "rewards/chosen": -2.9511194229125977, "rewards/margins": 1.6483557224273682, "rewards/rejected": -4.599474906921387, "step": 6480 }, { "epoch": 0.85, "grad_norm": 16.0, "learning_rate": 3.382751971125345e-07, "logits/chosen": 0.7153388261795044, "logits/rejected": 1.2328914403915405, "logps/chosen": -628.5037841796875, "logps/rejected": -717.3402099609375, "loss": 0.5905, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.6130409240722656, "rewards/margins": 1.0365272760391235, "rewards/rejected": -4.649567604064941, "step": 6490 }, { "epoch": 0.85, "grad_norm": 10.0, "learning_rate": 3.3256111181137753e-07, "logits/chosen": 0.12050239741802216, "logits/rejected": 1.2598720788955688, "logps/chosen": -616.6473388671875, "logps/rejected": -675.8792724609375, "loss": 0.4688, "rewards/accuracies": 0.75, "rewards/chosen": -3.2907919883728027, "rewards/margins": 1.167119026184082, "rewards/rejected": -4.457911491394043, "step": 6500 }, { "epoch": 0.85, "eval_logits/chosen": 1.040233850479126, "eval_logits/rejected": 1.7862778902053833, "eval_logps/chosen": -593.2546997070312, "eval_logps/rejected": -704.1409912109375, "eval_loss": 0.48905226588249207, "eval_rewards/accuracies": 0.7505000233650208, "eval_rewards/chosen": -3.286336660385132, "eval_rewards/margins": 1.3092845678329468, "eval_rewards/rejected": -4.595621109008789, "eval_runtime": 1590.4062, "eval_samples_per_second": 1.258, "eval_steps_per_second": 0.314, "step": 6500 }, { "epoch": 0.85, "grad_norm": 17.0, "learning_rate": 3.2689225871971905e-07, "logits/chosen": 0.286967933177948, "logits/rejected": 1.4739683866500854, "logps/chosen": -553.6991577148438, "logps/rejected": -687.5084228515625, "loss": 0.4413, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.2888290882110596, "rewards/margins": 1.5385186672210693, "rewards/rejected": -4.827347755432129, "step": 6510 }, { "epoch": 0.85, "grad_norm": 11.1875, "learning_rate": 3.2126875614066523e-07, "logits/chosen": -0.10929323732852936, "logits/rejected": 1.3522584438323975, "logps/chosen": -552.710205078125, "logps/rejected": -673.8087768554688, "loss": 0.477, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0392167568206787, "rewards/margins": 1.356864333152771, "rewards/rejected": -4.39608097076416, "step": 6520 }, { "epoch": 0.85, "grad_norm": 16.125, "learning_rate": 3.156907214309024e-07, "logits/chosen": 0.7336920499801636, "logits/rejected": 0.7236236333847046, "logps/chosen": -560.0294189453125, "logps/rejected": -728.3599243164062, "loss": 0.4632, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.27976655960083, "rewards/margins": 1.335707426071167, "rewards/rejected": -4.615473747253418, "step": 6530 }, { "epoch": 0.86, "grad_norm": 11.0625, "learning_rate": 3.1015827099824923e-07, "logits/chosen": 0.3031115233898163, "logits/rejected": 1.1659104824066162, "logps/chosen": -624.4817504882812, "logps/rejected": -751.3743896484375, "loss": 0.5015, "rewards/accuracies": 0.75, "rewards/chosen": -3.492037534713745, "rewards/margins": 1.4371260404586792, "rewards/rejected": -4.929163932800293, "step": 6540 }, { "epoch": 0.86, "grad_norm": 8.1875, "learning_rate": 3.0467152029922926e-07, "logits/chosen": 0.5154403448104858, "logits/rejected": 0.4819985032081604, "logps/chosen": -536.7814331054688, "logps/rejected": -670.2327270507812, "loss": 0.5557, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.159590482711792, "rewards/margins": 1.1023595333099365, "rewards/rejected": -4.2619500160217285, "step": 6550 }, { "epoch": 0.86, "grad_norm": 11.4375, "learning_rate": 2.992305838366591e-07, "logits/chosen": 0.2807716131210327, "logits/rejected": 1.1566855907440186, "logps/chosen": -624.9320678710938, "logps/rejected": -709.5386352539062, "loss": 0.4996, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.347169876098633, "rewards/margins": 1.311095952987671, "rewards/rejected": -4.658266067504883, "step": 6560 }, { "epoch": 0.86, "grad_norm": 11.625, "learning_rate": 2.938355751572583e-07, "logits/chosen": 0.4360644817352295, "logits/rejected": 1.5401076078414917, "logps/chosen": -575.8522338867188, "logps/rejected": -724.4762573242188, "loss": 0.402, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.2187793254852295, "rewards/margins": 1.7893708944320679, "rewards/rejected": -5.008150100708008, "step": 6570 }, { "epoch": 0.86, "grad_norm": 20.625, "learning_rate": 2.8848660684928307e-07, "logits/chosen": 0.6239246726036072, "logits/rejected": 1.2716379165649414, "logps/chosen": -583.64697265625, "logps/rejected": -739.271728515625, "loss": 0.3718, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.1440670490264893, "rewards/margins": 1.6428172588348389, "rewards/rejected": -4.786884307861328, "step": 6580 }, { "epoch": 0.86, "grad_norm": 11.375, "learning_rate": 2.8318379054017383e-07, "logits/chosen": 0.28017204999923706, "logits/rejected": 0.6756846904754639, "logps/chosen": -587.0732421875, "logps/rejected": -677.6159057617188, "loss": 0.6121, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -3.404522657394409, "rewards/margins": 0.8272771835327148, "rewards/rejected": -4.231800079345703, "step": 6590 }, { "epoch": 0.86, "grad_norm": 9.3125, "learning_rate": 2.779272368942246e-07, "logits/chosen": 0.1423681676387787, "logits/rejected": 1.035640001296997, "logps/chosen": -550.0023193359375, "logps/rejected": -636.7756958007812, "loss": 0.5062, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.1119656562805176, "rewards/margins": 1.2890288829803467, "rewards/rejected": -4.400994300842285, "step": 6600 }, { "epoch": 0.86, "eval_logits/chosen": 1.0260632038116455, "eval_logits/rejected": 1.769521951675415, "eval_logps/chosen": -593.8478393554688, "eval_logps/rejected": -704.8690795898438, "eval_loss": 0.4889431893825531, "eval_rewards/accuracies": 0.7484999895095825, "eval_rewards/chosen": -3.2922675609588623, "eval_rewards/margins": 1.3106337785720825, "eval_rewards/rejected": -4.602901458740234, "eval_runtime": 1590.2933, "eval_samples_per_second": 1.258, "eval_steps_per_second": 0.314, "step": 6600 }, { "epoch": 0.86, "grad_norm": 11.1875, "learning_rate": 2.7271705561027986e-07, "logits/chosen": 0.3130180835723877, "logits/rejected": 0.7469149827957153, "logps/chosen": -608.0856323242188, "logps/rejected": -725.796875, "loss": 0.5438, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.389724016189575, "rewards/margins": 1.0343810319900513, "rewards/rejected": -4.424104690551758, "step": 6610 }, { "epoch": 0.87, "grad_norm": 28.0, "learning_rate": 2.6755335541943677e-07, "logits/chosen": 0.19360849261283875, "logits/rejected": 1.0395077466964722, "logps/chosen": -598.2835693359375, "logps/rejected": -704.2481689453125, "loss": 0.4799, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.331153392791748, "rewards/margins": 1.3434398174285889, "rewards/rejected": -4.674593448638916, "step": 6620 }, { "epoch": 0.87, "grad_norm": 10.4375, "learning_rate": 2.62436244082781e-07, "logits/chosen": 0.21957476437091827, "logits/rejected": 0.7514572143554688, "logps/chosen": -567.141845703125, "logps/rejected": -697.9176025390625, "loss": 0.4481, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1435608863830566, "rewards/margins": 1.4471392631530762, "rewards/rejected": -4.590699672698975, "step": 6630 }, { "epoch": 0.87, "grad_norm": 7.125, "learning_rate": 2.5736582838913836e-07, "logits/chosen": 0.22806484997272491, "logits/rejected": 0.9933506846427917, "logps/chosen": -621.5804443359375, "logps/rejected": -714.3753662109375, "loss": 0.46, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.256880521774292, "rewards/margins": 1.346673846244812, "rewards/rejected": -4.6035542488098145, "step": 6640 }, { "epoch": 0.87, "grad_norm": 16.0, "learning_rate": 2.5234221415284363e-07, "logits/chosen": 0.29249635338783264, "logits/rejected": 1.1430519819259644, "logps/chosen": -599.8883666992188, "logps/rejected": -712.6788330078125, "loss": 0.4889, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.0099844932556152, "rewards/margins": 1.4878642559051514, "rewards/rejected": -4.4978485107421875, "step": 6650 }, { "epoch": 0.87, "grad_norm": 14.5625, "learning_rate": 2.4736550621153375e-07, "logits/chosen": 0.7163133025169373, "logits/rejected": 1.4217084646224976, "logps/chosen": -590.7764892578125, "logps/rejected": -702.0975952148438, "loss": 0.5071, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.3552703857421875, "rewards/margins": 1.2537267208099365, "rewards/rejected": -4.608997344970703, "step": 6660 }, { "epoch": 0.87, "grad_norm": 15.3125, "learning_rate": 2.424358084239609e-07, "logits/chosen": 0.09558672457933426, "logits/rejected": 1.1775563955307007, "logps/chosen": -608.3162231445312, "logps/rejected": -671.51220703125, "loss": 0.5659, "rewards/accuracies": 0.6875, "rewards/chosen": -3.291945219039917, "rewards/margins": 1.0342015027999878, "rewards/rejected": -4.326146125793457, "step": 6670 }, { "epoch": 0.87, "grad_norm": 18.375, "learning_rate": 2.3755322366782158e-07, "logits/chosen": 0.4346315860748291, "logits/rejected": 0.7198947072029114, "logps/chosen": -591.690673828125, "logps/rejected": -744.840087890625, "loss": 0.5241, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.6358695030212402, "rewards/margins": 1.2703872919082642, "rewards/rejected": -4.906256675720215, "step": 6680 }, { "epoch": 0.88, "grad_norm": 7.71875, "learning_rate": 2.3271785383761431e-07, "logits/chosen": 0.19641201198101044, "logits/rejected": 1.4842437505722046, "logps/chosen": -642.49853515625, "logps/rejected": -760.596435546875, "loss": 0.4263, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.3886337280273438, "rewards/margins": 1.6620216369628906, "rewards/rejected": -5.050655364990234, "step": 6690 }, { "epoch": 0.88, "grad_norm": 17.125, "learning_rate": 2.2792979984250978e-07, "logits/chosen": 0.26097798347473145, "logits/rejected": 1.3378149271011353, "logps/chosen": -581.4967651367188, "logps/rejected": -659.17578125, "loss": 0.574, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.349140167236328, "rewards/margins": 1.2462130784988403, "rewards/rejected": -4.595353603363037, "step": 6700 }, { "epoch": 0.88, "eval_logits/chosen": 1.0140420198440552, "eval_logits/rejected": 1.757304072380066, "eval_logps/chosen": -592.4088745117188, "eval_logps/rejected": -703.44287109375, "eval_loss": 0.4887068569660187, "eval_rewards/accuracies": 0.7494999766349792, "eval_rewards/chosen": -3.277878522872925, "eval_rewards/margins": 1.3107616901397705, "eval_rewards/rejected": -4.588640213012695, "eval_runtime": 1592.3239, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 6700 }, { "epoch": 0.88, "grad_norm": 6.375, "learning_rate": 2.231891616042453e-07, "logits/chosen": 0.10554889589548111, "logits/rejected": 1.7582786083221436, "logps/chosen": -627.1198120117188, "logps/rejected": -784.3391723632812, "loss": 0.3874, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.3611323833465576, "rewards/margins": 1.8259894847869873, "rewards/rejected": -5.187121868133545, "step": 6710 }, { "epoch": 0.88, "grad_norm": 20.25, "learning_rate": 2.1849603805504328e-07, "logits/chosen": 0.6380084753036499, "logits/rejected": 1.636945366859436, "logps/chosen": -549.2799072265625, "logps/rejected": -701.5870971679688, "loss": 0.4267, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3033134937286377, "rewards/margins": 1.6281543970108032, "rewards/rejected": -4.9314680099487305, "step": 6720 }, { "epoch": 0.88, "grad_norm": 17.5, "learning_rate": 2.1385052713554066e-07, "logits/chosen": -0.12183733284473419, "logits/rejected": 1.4319367408752441, "logps/chosen": -617.0516967773438, "logps/rejected": -651.3189086914062, "loss": 0.4917, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2507731914520264, "rewards/margins": 1.1351972818374634, "rewards/rejected": -4.385970592498779, "step": 6730 }, { "epoch": 0.88, "grad_norm": 6.3125, "learning_rate": 2.0925272579274873e-07, "logits/chosen": 0.11045311391353607, "logits/rejected": 0.18677489459514618, "logps/chosen": -570.3370361328125, "logps/rejected": -668.0396728515625, "loss": 0.5353, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.2531864643096924, "rewards/margins": 1.0719640254974365, "rewards/rejected": -4.325150489807129, "step": 6740 }, { "epoch": 0.88, "grad_norm": 17.5, "learning_rate": 2.047027299780302e-07, "logits/chosen": 0.4011760354042053, "logits/rejected": 1.157820224761963, "logps/chosen": -590.8807373046875, "logps/rejected": -714.6888427734375, "loss": 0.5, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.279855251312256, "rewards/margins": 1.36622953414917, "rewards/rejected": -4.646084785461426, "step": 6750 }, { "epoch": 0.88, "grad_norm": 13.875, "learning_rate": 2.0020063464509492e-07, "logits/chosen": 0.5368996858596802, "logits/rejected": 1.6831638813018799, "logps/chosen": -544.4091186523438, "logps/rejected": -690.4616088867188, "loss": 0.4165, "rewards/accuracies": 0.8125, "rewards/chosen": -3.224353313446045, "rewards/margins": 1.5393102169036865, "rewards/rejected": -4.7636637687683105, "step": 6760 }, { "epoch": 0.89, "grad_norm": 7.34375, "learning_rate": 1.957465337480191e-07, "logits/chosen": 0.08371684700250626, "logits/rejected": 1.7062151432037354, "logps/chosen": -610.7371826171875, "logps/rejected": -743.8865966796875, "loss": 0.3572, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.165255308151245, "rewards/margins": 1.8801519870758057, "rewards/rejected": -5.045407295227051, "step": 6770 }, { "epoch": 0.89, "grad_norm": 17.5, "learning_rate": 1.9134052023928622e-07, "logits/chosen": 0.4712475836277008, "logits/rejected": 1.06361722946167, "logps/chosen": -583.5921020507812, "logps/rejected": -697.35302734375, "loss": 0.4598, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.1947989463806152, "rewards/margins": 1.396549105644226, "rewards/rejected": -4.591347694396973, "step": 6780 }, { "epoch": 0.89, "grad_norm": 17.125, "learning_rate": 1.8698268606784392e-07, "logits/chosen": 0.4350413680076599, "logits/rejected": 1.8006197214126587, "logps/chosen": -607.4656372070312, "logps/rejected": -689.8092041015625, "loss": 0.4508, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.259523391723633, "rewards/margins": 1.4721763134002686, "rewards/rejected": -4.7316999435424805, "step": 6790 }, { "epoch": 0.89, "grad_norm": 22.75, "learning_rate": 1.826731221771866e-07, "logits/chosen": 0.5540640950202942, "logits/rejected": 0.7710912227630615, "logps/chosen": -551.3302612304688, "logps/rejected": -673.9688720703125, "loss": 0.5737, "rewards/accuracies": 0.6875, "rewards/chosen": -3.183842420578003, "rewards/margins": 1.0950297117233276, "rewards/rejected": -4.278872013092041, "step": 6800 }, { "epoch": 0.89, "eval_logits/chosen": 1.0125845670700073, "eval_logits/rejected": 1.755982518196106, "eval_logps/chosen": -593.7937622070312, "eval_logps/rejected": -704.9940185546875, "eval_loss": 0.4887009859085083, "eval_rewards/accuracies": 0.7509999871253967, "eval_rewards/chosen": -3.291727066040039, "eval_rewards/margins": 1.312423586845398, "eval_rewards/rejected": -4.604150295257568, "eval_runtime": 1591.5754, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 6800 }, { "epoch": 0.89, "grad_norm": 19.0, "learning_rate": 1.7841191850345967e-07, "logits/chosen": 0.5574437379837036, "logits/rejected": 1.2674754858016968, "logps/chosen": -571.6987915039062, "logps/rejected": -722.4050903320312, "loss": 0.4244, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2846603393554688, "rewards/margins": 1.5775883197784424, "rewards/rejected": -4.862248420715332, "step": 6810 }, { "epoch": 0.89, "grad_norm": 22.625, "learning_rate": 1.7419916397357905e-07, "logits/chosen": 0.705112099647522, "logits/rejected": 1.1643078327178955, "logps/chosen": -580.8120727539062, "logps/rejected": -690.5592041015625, "loss": 0.5157, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.315786838531494, "rewards/margins": 1.2436031103134155, "rewards/rejected": -4.559390068054199, "step": 6820 }, { "epoch": 0.89, "grad_norm": 8.8125, "learning_rate": 1.700349465033782e-07, "logits/chosen": 0.3245560824871063, "logits/rejected": 1.4476044178009033, "logps/chosen": -562.4835815429688, "logps/rejected": -703.9220581054688, "loss": 0.3617, "rewards/accuracies": 0.8125, "rewards/chosen": -3.087944269180298, "rewards/margins": 1.636804223060608, "rewards/rejected": -4.7247490882873535, "step": 6830 }, { "epoch": 0.9, "grad_norm": 13.4375, "learning_rate": 1.6591935299577227e-07, "logits/chosen": 0.8055804967880249, "logits/rejected": 1.5917952060699463, "logps/chosen": -572.6065673828125, "logps/rejected": -755.8960571289062, "loss": 0.4138, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.3246803283691406, "rewards/margins": 1.7669527530670166, "rewards/rejected": -5.091632843017578, "step": 6840 }, { "epoch": 0.9, "grad_norm": 15.6875, "learning_rate": 1.6185246933894338e-07, "logits/chosen": 0.25553420186042786, "logits/rejected": 1.0945329666137695, "logps/chosen": -616.6585693359375, "logps/rejected": -763.8131713867188, "loss": 0.3875, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.3123505115509033, "rewards/margins": 1.6443926095962524, "rewards/rejected": -4.956742763519287, "step": 6850 }, { "epoch": 0.9, "grad_norm": 16.625, "learning_rate": 1.5783438040455097e-07, "logits/chosen": 0.12614001333713531, "logits/rejected": 1.4914333820343018, "logps/chosen": -577.5618286132812, "logps/rejected": -689.0383911132812, "loss": 0.3949, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.0827829837799072, "rewards/margins": 1.6236732006072998, "rewards/rejected": -4.706456184387207, "step": 6860 }, { "epoch": 0.9, "grad_norm": 10.6875, "learning_rate": 1.538651700459576e-07, "logits/chosen": 0.025020074099302292, "logits/rejected": 0.9685807228088379, "logps/chosen": -585.4603271484375, "logps/rejected": -646.5584716796875, "loss": 0.5232, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.115504503250122, "rewards/margins": 1.0184857845306396, "rewards/rejected": -4.133990287780762, "step": 6870 }, { "epoch": 0.9, "grad_norm": 14.0, "learning_rate": 1.4994492109648151e-07, "logits/chosen": 0.2431701421737671, "logits/rejected": 0.8176721334457397, "logps/chosen": -597.099365234375, "logps/rejected": -721.96337890625, "loss": 0.4917, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.4174275398254395, "rewards/margins": 1.211806058883667, "rewards/rejected": -4.629233360290527, "step": 6880 }, { "epoch": 0.9, "grad_norm": 8.5, "learning_rate": 1.4607371536766695e-07, "logits/chosen": 0.5689305067062378, "logits/rejected": 1.2870957851409912, "logps/chosen": -623.2686767578125, "logps/rejected": -730.4143676757812, "loss": 0.4955, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.4705052375793457, "rewards/margins": 1.2539684772491455, "rewards/rejected": -4.72447395324707, "step": 6890 }, { "epoch": 0.9, "grad_norm": 12.8125, "learning_rate": 1.4225163364757655e-07, "logits/chosen": -0.0002982348087243736, "logits/rejected": 1.2865327596664429, "logps/chosen": -572.536865234375, "logps/rejected": -690.8985595703125, "loss": 0.4298, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.1573309898376465, "rewards/margins": 1.4404720067977905, "rewards/rejected": -4.597803592681885, "step": 6900 }, { "epoch": 0.9, "eval_logits/chosen": 1.0130374431610107, "eval_logits/rejected": 1.7562763690948486, "eval_logps/chosen": -594.4664306640625, "eval_logps/rejected": -705.733154296875, "eval_loss": 0.4889255464076996, "eval_rewards/accuracies": 0.7505000233650208, "eval_rewards/chosen": -3.2984538078308105, "eval_rewards/margins": 1.3130884170532227, "eval_rewards/rejected": -4.611542224884033, "eval_runtime": 1591.8913, "eval_samples_per_second": 1.256, "eval_steps_per_second": 0.314, "step": 6900 }, { "epoch": 0.9, "grad_norm": 20.5, "learning_rate": 1.3847875569910462e-07, "logits/chosen": 0.5036298632621765, "logits/rejected": 1.5973215103149414, "logps/chosen": -556.7360229492188, "logps/rejected": -693.60009765625, "loss": 0.4498, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.1470251083374023, "rewards/margins": 1.5626558065414429, "rewards/rejected": -4.709681034088135, "step": 6910 }, { "epoch": 0.91, "grad_norm": 7.21875, "learning_rate": 1.3475516025831552e-07, "logits/chosen": 0.2578235864639282, "logits/rejected": 1.1093521118164062, "logps/chosen": -571.198974609375, "logps/rejected": -716.4463500976562, "loss": 0.4159, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.120373249053955, "rewards/margins": 1.5449237823486328, "rewards/rejected": -4.665297031402588, "step": 6920 }, { "epoch": 0.91, "grad_norm": 8.5625, "learning_rate": 1.310809250327974e-07, "logits/chosen": 0.6653910875320435, "logits/rejected": 0.986367404460907, "logps/chosen": -549.1756591796875, "logps/rejected": -682.2640991210938, "loss": 0.5015, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.378861665725708, "rewards/margins": 1.2142317295074463, "rewards/rejected": -4.593092918395996, "step": 6930 }, { "epoch": 0.91, "grad_norm": 38.5, "learning_rate": 1.2745612670004153e-07, "logits/chosen": 0.09889905154705048, "logits/rejected": 1.0996273756027222, "logps/chosen": -607.4309692382812, "logps/rejected": -675.5977172851562, "loss": 0.5235, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2739291191101074, "rewards/margins": 1.0797994136810303, "rewards/rejected": -4.353728294372559, "step": 6940 }, { "epoch": 0.91, "grad_norm": 18.75, "learning_rate": 1.2388084090584395e-07, "logits/chosen": 0.2793113589286804, "logits/rejected": 1.003348469734192, "logps/chosen": -564.1325073242188, "logps/rejected": -657.8829345703125, "loss": 0.5175, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.3109850883483887, "rewards/margins": 1.2358797788619995, "rewards/rejected": -4.5468645095825195, "step": 6950 }, { "epoch": 0.91, "grad_norm": 10.375, "learning_rate": 1.2035514226272305e-07, "logits/chosen": 0.45842522382736206, "logits/rejected": 1.0828557014465332, "logps/chosen": -558.7763061523438, "logps/rejected": -696.6474609375, "loss": 0.4225, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.125001907348633, "rewards/margins": 1.363785743713379, "rewards/rejected": -4.488787651062012, "step": 6960 }, { "epoch": 0.91, "grad_norm": 12.75, "learning_rate": 1.1687910434836607e-07, "logits/chosen": 0.1126895397901535, "logits/rejected": 1.165924310684204, "logps/chosen": -629.3409423828125, "logps/rejected": -714.7694091796875, "loss": 0.446, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.295433759689331, "rewards/margins": 1.2763313055038452, "rewards/rejected": -4.571765422821045, "step": 6970 }, { "epoch": 0.91, "grad_norm": 14.1875, "learning_rate": 1.1345279970409128e-07, "logits/chosen": 0.44308653473854065, "logits/rejected": 0.8887012600898743, "logps/chosen": -580.622802734375, "logps/rejected": -748.8364868164062, "loss": 0.4164, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9695210456848145, "rewards/margins": 1.5574761629104614, "rewards/rejected": -4.526997089385986, "step": 6980 }, { "epoch": 0.91, "grad_norm": 13.9375, "learning_rate": 1.1007629983333629e-07, "logits/chosen": 0.2842895984649658, "logits/rejected": 0.597855806350708, "logps/chosen": -570.9032592773438, "logps/rejected": -714.6888427734375, "loss": 0.5177, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.316150665283203, "rewards/margins": 1.3857309818267822, "rewards/rejected": -4.7018818855285645, "step": 6990 }, { "epoch": 0.92, "grad_norm": 10.6875, "learning_rate": 1.067496752001626e-07, "logits/chosen": 0.09682926535606384, "logits/rejected": 0.9496966600418091, "logps/chosen": -586.10888671875, "logps/rejected": -670.243896484375, "loss": 0.55, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.264702558517456, "rewards/margins": 1.1133906841278076, "rewards/rejected": -4.378093242645264, "step": 7000 }, { "epoch": 0.92, "eval_logits/chosen": 1.0131945610046387, "eval_logits/rejected": 1.7566964626312256, "eval_logps/chosen": -594.5901489257812, "eval_logps/rejected": -705.9526977539062, "eval_loss": 0.4888876676559448, "eval_rewards/accuracies": 0.7505000233650208, "eval_rewards/chosen": -3.2996912002563477, "eval_rewards/margins": 1.3140465021133423, "eval_rewards/rejected": -4.613737106323242, "eval_runtime": 1591.6581, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 7000 }, { "epoch": 0.92, "grad_norm": 20.75, "learning_rate": 1.0347299522778909e-07, "logits/chosen": 0.4687480032444, "logits/rejected": 1.1106703281402588, "logps/chosen": -580.3803100585938, "logps/rejected": -705.0828247070312, "loss": 0.5507, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.449733018875122, "rewards/margins": 1.257815957069397, "rewards/rejected": -4.707549095153809, "step": 7010 }, { "epoch": 0.92, "grad_norm": 31.5, "learning_rate": 1.0024632829713971e-07, "logits/chosen": 0.4061342179775238, "logits/rejected": 1.314821481704712, "logps/chosen": -599.4215087890625, "logps/rejected": -715.9715576171875, "loss": 0.4931, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.3331425189971924, "rewards/margins": 1.471777319908142, "rewards/rejected": -4.804919242858887, "step": 7020 }, { "epoch": 0.92, "grad_norm": 7.75, "learning_rate": 9.706974174541889e-08, "logits/chosen": 0.375336229801178, "logits/rejected": 0.8008115887641907, "logps/chosen": -570.6932983398438, "logps/rejected": -681.4544677734375, "loss": 0.476, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.402949571609497, "rewards/margins": 1.1330082416534424, "rewards/rejected": -4.5359578132629395, "step": 7030 }, { "epoch": 0.92, "grad_norm": 21.125, "learning_rate": 9.39433018647043e-08, "logits/chosen": 0.37960708141326904, "logits/rejected": 1.2306239604949951, "logps/chosen": -596.1632080078125, "logps/rejected": -692.4556884765625, "loss": 0.4308, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.3076729774475098, "rewards/margins": 1.342871904373169, "rewards/rejected": -4.6505446434021, "step": 7040 }, { "epoch": 0.92, "grad_norm": 7.03125, "learning_rate": 9.086707390056543e-08, "logits/chosen": 0.22423484921455383, "logits/rejected": 0.9336155652999878, "logps/chosen": -581.1544799804688, "logps/rejected": -696.49609375, "loss": 0.4646, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2052035331726074, "rewards/margins": 1.3599494695663452, "rewards/rejected": -4.565153121948242, "step": 7050 }, { "epoch": 0.92, "grad_norm": 10.375, "learning_rate": 8.784112205070083e-08, "logits/chosen": 0.09238873422145844, "logits/rejected": 1.2443900108337402, "logps/chosen": -570.7318115234375, "logps/rejected": -729.5119018554688, "loss": 0.3718, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.9837429523468018, "rewards/margins": 1.624291181564331, "rewards/rejected": -4.608033657073975, "step": 7060 }, { "epoch": 0.93, "grad_norm": 18.5, "learning_rate": 8.486550946359779e-08, "logits/chosen": 0.5402406454086304, "logits/rejected": 0.9340842962265015, "logps/chosen": -619.3458251953125, "logps/rejected": -691.1231689453125, "loss": 0.4756, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.4807777404785156, "rewards/margins": 1.2306329011917114, "rewards/rejected": -4.7114105224609375, "step": 7070 }, { "epoch": 0.93, "grad_norm": 22.875, "learning_rate": 8.194029823721556e-08, "logits/chosen": 0.4674089848995209, "logits/rejected": 1.4463164806365967, "logps/chosen": -595.6998901367188, "logps/rejected": -724.3187255859375, "loss": 0.5678, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.4917690753936768, "rewards/margins": 1.3110939264297485, "rewards/rejected": -4.802863121032715, "step": 7080 }, { "epoch": 0.93, "grad_norm": 17.0, "learning_rate": 7.906554941768896e-08, "logits/chosen": 0.3445693254470825, "logits/rejected": 0.8292980194091797, "logps/chosen": -568.9578857421875, "logps/rejected": -723.1265869140625, "loss": 0.5519, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.342053174972534, "rewards/margins": 1.3594369888305664, "rewards/rejected": -4.7014899253845215, "step": 7090 }, { "epoch": 0.93, "grad_norm": 19.375, "learning_rate": 7.624132299805575e-08, "logits/chosen": 0.2580556273460388, "logits/rejected": 1.3959585428237915, "logps/chosen": -638.3900146484375, "logps/rejected": -735.552978515625, "loss": 0.4123, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.3954243659973145, "rewards/margins": 1.474463701248169, "rewards/rejected": -4.869887828826904, "step": 7100 }, { "epoch": 0.93, "eval_logits/chosen": 1.0150725841522217, "eval_logits/rejected": 1.7585645914077759, "eval_logps/chosen": -594.8818969726562, "eval_logps/rejected": -706.2577514648438, "eval_loss": 0.4888802468776703, "eval_rewards/accuracies": 0.7515000104904175, "eval_rewards/chosen": -3.302608013153076, "eval_rewards/margins": 1.3141798973083496, "eval_rewards/rejected": -4.616787910461426, "eval_runtime": 1591.5021, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 7100 }, { "epoch": 0.93, "grad_norm": 12.375, "learning_rate": 7.346767791700127e-08, "logits/chosen": 0.40067845582962036, "logits/rejected": 0.7876986861228943, "logps/chosen": -565.1066284179688, "logps/rejected": -656.0726318359375, "loss": 0.6681, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.3934097290039062, "rewards/margins": 0.9237259030342102, "rewards/rejected": -4.317135810852051, "step": 7110 }, { "epoch": 0.93, "grad_norm": 5.75, "learning_rate": 7.07446720576327e-08, "logits/chosen": 0.2647624611854553, "logits/rejected": 0.5136129260063171, "logps/chosen": -589.279296875, "logps/rejected": -711.5758666992188, "loss": 0.5057, "rewards/accuracies": 0.75, "rewards/chosen": -3.370063066482544, "rewards/margins": 1.0963002443313599, "rewards/rejected": -4.466362953186035, "step": 7120 }, { "epoch": 0.93, "grad_norm": 9.0625, "learning_rate": 6.807236224626701e-08, "logits/chosen": 0.5471646189689636, "logits/rejected": 0.7506142854690552, "logps/chosen": -575.2327880859375, "logps/rejected": -703.8402099609375, "loss": 0.4122, "rewards/accuracies": 0.8125, "rewards/chosen": -3.049259901046753, "rewards/margins": 1.5313215255737305, "rewards/rejected": -4.5805816650390625, "step": 7130 }, { "epoch": 0.93, "grad_norm": 5.90625, "learning_rate": 6.545080425124888e-08, "logits/chosen": 0.6009904146194458, "logits/rejected": 1.5046106576919556, "logps/chosen": -560.8460693359375, "logps/rejected": -698.7578735351562, "loss": 0.4021, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.1606037616729736, "rewards/margins": 1.6776975393295288, "rewards/rejected": -4.838300704956055, "step": 7140 }, { "epoch": 0.94, "grad_norm": 7.0, "learning_rate": 6.288005278178382e-08, "logits/chosen": 0.10890443623065948, "logits/rejected": 1.0314215421676636, "logps/chosen": -591.5215454101562, "logps/rejected": -700.6073608398438, "loss": 0.4078, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.2524211406707764, "rewards/margins": 1.4063465595245361, "rewards/rejected": -4.658768177032471, "step": 7150 }, { "epoch": 0.94, "grad_norm": 14.6875, "learning_rate": 6.036016148679825e-08, "logits/chosen": 0.10704119503498077, "logits/rejected": 1.2058827877044678, "logps/chosen": -549.1723022460938, "logps/rejected": -699.8775634765625, "loss": 0.3788, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.9703383445739746, "rewards/margins": 1.7024837732315063, "rewards/rejected": -4.672821998596191, "step": 7160 }, { "epoch": 0.94, "grad_norm": 13.25, "learning_rate": 5.7891182953819235e-08, "logits/chosen": 0.6250897645950317, "logits/rejected": 0.94916170835495, "logps/chosen": -609.9509887695312, "logps/rejected": -693.9453125, "loss": 0.5902, "rewards/accuracies": 0.6875, "rewards/chosen": -3.5461039543151855, "rewards/margins": 1.0024358034133911, "rewards/rejected": -4.548539638519287, "step": 7170 }, { "epoch": 0.94, "grad_norm": 14.9375, "learning_rate": 5.547316870787689e-08, "logits/chosen": 0.0662744864821434, "logits/rejected": 1.2459442615509033, "logps/chosen": -588.7398681640625, "logps/rejected": -661.6343994140625, "loss": 0.5033, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.239532947540283, "rewards/margins": 1.3408972024917603, "rewards/rejected": -4.580430507659912, "step": 7180 }, { "epoch": 0.94, "grad_norm": 13.625, "learning_rate": 5.310616921042927e-08, "logits/chosen": 0.13475020229816437, "logits/rejected": 1.097031593322754, "logps/chosen": -666.7498779296875, "logps/rejected": -723.42236328125, "loss": 0.4635, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3409552574157715, "rewards/margins": 1.3054221868515015, "rewards/rejected": -4.6463775634765625, "step": 7190 }, { "epoch": 0.94, "grad_norm": 21.25, "learning_rate": 5.079023385830939e-08, "logits/chosen": 0.4373611509799957, "logits/rejected": 1.603753685951233, "logps/chosen": -611.4661865234375, "logps/rejected": -711.8818359375, "loss": 0.5207, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.7718963623046875, "rewards/margins": 1.09917414188385, "rewards/rejected": -4.871070384979248, "step": 7200 }, { "epoch": 0.94, "eval_logits/chosen": 1.01255202293396, "eval_logits/rejected": 1.7557493448257446, "eval_logps/chosen": -595.11279296875, "eval_logps/rejected": -706.500732421875, "eval_loss": 0.48874008655548096, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -3.304917097091675, "eval_rewards/margins": 1.3143013715744019, "eval_rewards/rejected": -4.619218826293945, "eval_runtime": 1591.398, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 7200 }, { "epoch": 0.94, "grad_norm": 22.125, "learning_rate": 4.8525410982695476e-08, "logits/chosen": 0.41507038474082947, "logits/rejected": 1.2446469068527222, "logps/chosen": -608.064453125, "logps/rejected": -712.0289306640625, "loss": 0.5465, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.3873703479766846, "rewards/margins": 1.2731060981750488, "rewards/rejected": -4.6604766845703125, "step": 7210 }, { "epoch": 0.94, "grad_norm": 7.0, "learning_rate": 4.6311747848099e-08, "logits/chosen": -0.24930362403392792, "logits/rejected": 1.102052927017212, "logps/chosen": -652.6702270507812, "logps/rejected": -732.0322875976562, "loss": 0.5089, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.4032764434814453, "rewards/margins": 1.2973884344100952, "rewards/rejected": -4.700665473937988, "step": 7220 }, { "epoch": 0.95, "grad_norm": 18.875, "learning_rate": 4.4149290651382405e-08, "logits/chosen": 0.3960798382759094, "logits/rejected": 1.1941249370574951, "logps/chosen": -602.910888671875, "logps/rejected": -672.2167358398438, "loss": 0.5062, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.242851972579956, "rewards/margins": 1.2035329341888428, "rewards/rejected": -4.446384429931641, "step": 7230 }, { "epoch": 0.95, "grad_norm": 31.25, "learning_rate": 4.203808452079211e-08, "logits/chosen": 0.12929697334766388, "logits/rejected": 0.7588415741920471, "logps/chosen": -581.9747314453125, "logps/rejected": -684.8924560546875, "loss": 0.5113, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.2949206829071045, "rewards/margins": 1.1291944980621338, "rewards/rejected": -4.424115180969238, "step": 7240 }, { "epoch": 0.95, "grad_norm": 7.34375, "learning_rate": 3.9978173515018427e-08, "logits/chosen": 0.22393746674060822, "logits/rejected": 0.6054302453994751, "logps/chosen": -650.0538330078125, "logps/rejected": -768.4112548828125, "loss": 0.537, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.4565773010253906, "rewards/margins": 1.2421902418136597, "rewards/rejected": -4.69876766204834, "step": 7250 }, { "epoch": 0.95, "grad_norm": 9.75, "learning_rate": 3.7969600622274614e-08, "logits/chosen": 0.7730494141578674, "logits/rejected": 0.8779703378677368, "logps/chosen": -616.2741088867188, "logps/rejected": -757.5372314453125, "loss": 0.5026, "rewards/accuracies": 0.6875, "rewards/chosen": -3.5739378929138184, "rewards/margins": 1.1644906997680664, "rewards/rejected": -4.738428592681885, "step": 7260 }, { "epoch": 0.95, "grad_norm": 11.875, "learning_rate": 3.601240775940151e-08, "logits/chosen": 0.0924004465341568, "logits/rejected": 0.7291241884231567, "logps/chosen": -585.1265869140625, "logps/rejected": -715.2276611328125, "loss": 0.442, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.0802805423736572, "rewards/margins": 1.3805367946624756, "rewards/rejected": -4.460817337036133, "step": 7270 }, { "epoch": 0.95, "grad_norm": 9.875, "learning_rate": 3.410663577099071e-08, "logits/chosen": 0.8634665608406067, "logits/rejected": 1.24403715133667, "logps/chosen": -625.5086059570312, "logps/rejected": -712.8633422851562, "loss": 0.6482, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.5116772651672363, "rewards/margins": 1.0819501876831055, "rewards/rejected": -4.593627452850342, "step": 7280 }, { "epoch": 0.95, "grad_norm": 11.9375, "learning_rate": 3.2252324428534986e-08, "logits/chosen": 0.0949554294347763, "logits/rejected": 1.7730538845062256, "logps/chosen": -645.7748413085938, "logps/rejected": -777.1041259765625, "loss": 0.4368, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.345384120941162, "rewards/margins": 1.7858293056488037, "rewards/rejected": -5.131213188171387, "step": 7290 }, { "epoch": 0.96, "grad_norm": 12.0625, "learning_rate": 3.0449512429594486e-08, "logits/chosen": 0.6525617837905884, "logits/rejected": 1.2398340702056885, "logps/chosen": -587.5560302734375, "logps/rejected": -742.075927734375, "loss": 0.4618, "rewards/accuracies": 0.75, "rewards/chosen": -3.530935764312744, "rewards/margins": 1.445012092590332, "rewards/rejected": -4.975947856903076, "step": 7300 }, { "epoch": 0.96, "eval_logits/chosen": 1.0115752220153809, "eval_logits/rejected": 1.7552307844161987, "eval_logps/chosen": -594.8143310546875, "eval_logps/rejected": -706.2247314453125, "eval_loss": 0.48876285552978516, "eval_rewards/accuracies": 0.7515000104904175, "eval_rewards/chosen": -3.3019332885742188, "eval_rewards/margins": 1.3145242929458618, "eval_rewards/rejected": -4.616457462310791, "eval_runtime": 1590.7324, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 7300 }, { "epoch": 0.96, "grad_norm": 7.6875, "learning_rate": 2.8698237396992956e-08, "logits/chosen": 0.09208633005619049, "logits/rejected": 1.0006811618804932, "logps/chosen": -608.3304443359375, "logps/rejected": -736.591796875, "loss": 0.4175, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.210888624191284, "rewards/margins": 1.6330093145370483, "rewards/rejected": -4.843898296356201, "step": 7310 }, { "epoch": 0.96, "grad_norm": 11.5625, "learning_rate": 2.6998535878030584e-08, "logits/chosen": 0.43088826537132263, "logits/rejected": 1.5745205879211426, "logps/chosen": -615.910400390625, "logps/rejected": -689.7060546875, "loss": 0.5256, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.2760322093963623, "rewards/margins": 1.1446540355682373, "rewards/rejected": -4.4206862449646, "step": 7320 }, { "epoch": 0.96, "grad_norm": 10.25, "learning_rate": 2.535044334372072e-08, "logits/chosen": 0.48195523023605347, "logits/rejected": 1.243807077407837, "logps/chosen": -566.0392456054688, "logps/rejected": -699.8907470703125, "loss": 0.5006, "rewards/accuracies": 0.8125, "rewards/chosen": -3.182238817214966, "rewards/margins": 1.3754639625549316, "rewards/rejected": -4.557702541351318, "step": 7330 }, { "epoch": 0.96, "grad_norm": 7.84375, "learning_rate": 2.3753994188051853e-08, "logits/chosen": -0.08187036216259003, "logits/rejected": 0.7814757227897644, "logps/chosen": -654.4878540039062, "logps/rejected": -722.797119140625, "loss": 0.6864, "rewards/accuracies": 0.625, "rewards/chosen": -3.5121543407440186, "rewards/margins": 0.8491532206535339, "rewards/rejected": -4.361307621002197, "step": 7340 }, { "epoch": 0.96, "grad_norm": 26.125, "learning_rate": 2.220922172726764e-08, "logits/chosen": 0.5146197080612183, "logits/rejected": 1.2585347890853882, "logps/chosen": -602.0792236328125, "logps/rejected": -677.6658935546875, "loss": 0.6084, "rewards/accuracies": 0.6875, "rewards/chosen": -3.289280414581299, "rewards/margins": 1.0045863389968872, "rewards/rejected": -4.2938666343688965, "step": 7350 }, { "epoch": 0.96, "grad_norm": 23.75, "learning_rate": 2.071615819917244e-08, "logits/chosen": 0.22979173064231873, "logits/rejected": 1.6183487176895142, "logps/chosen": -617.9466552734375, "logps/rejected": -690.7569580078125, "loss": 0.3736, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.1348488330841064, "rewards/margins": 1.4200165271759033, "rewards/rejected": -4.55486536026001, "step": 7360 }, { "epoch": 0.96, "grad_norm": 10.25, "learning_rate": 1.9274834762459393e-08, "logits/chosen": 0.6498265266418457, "logits/rejected": 1.3851298093795776, "logps/chosen": -553.2167358398438, "logps/rejected": -704.5006103515625, "loss": 0.4863, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.3176722526550293, "rewards/margins": 1.5971444845199585, "rewards/rejected": -4.914816856384277, "step": 7370 }, { "epoch": 0.97, "grad_norm": 6.75, "learning_rate": 1.7885281496058947e-08, "logits/chosen": 0.7782396674156189, "logits/rejected": 1.534798264503479, "logps/chosen": -580.90478515625, "logps/rejected": -677.3753662109375, "loss": 0.5651, "rewards/accuracies": 0.75, "rewards/chosen": -3.2415664196014404, "rewards/margins": 1.3312580585479736, "rewards/rejected": -4.572824001312256, "step": 7380 }, { "epoch": 0.97, "grad_norm": 28.125, "learning_rate": 1.654752739851134e-08, "logits/chosen": 0.4140414297580719, "logits/rejected": 1.5184310674667358, "logps/chosen": -642.1239013671875, "logps/rejected": -669.8280029296875, "loss": 0.537, "rewards/accuracies": 0.75, "rewards/chosen": -3.256727695465088, "rewards/margins": 1.1267528533935547, "rewards/rejected": -4.383481025695801, "step": 7390 }, { "epoch": 0.97, "grad_norm": 20.0, "learning_rate": 1.526160038736235e-08, "logits/chosen": 0.35390472412109375, "logits/rejected": 1.5663673877716064, "logps/chosen": -628.6697998046875, "logps/rejected": -712.1144409179688, "loss": 0.4826, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.2142441272735596, "rewards/margins": 1.3702830076217651, "rewards/rejected": -4.584527015686035, "step": 7400 }, { "epoch": 0.97, "eval_logits/chosen": 1.0107791423797607, "eval_logits/rejected": 1.7537970542907715, "eval_logps/chosen": -594.97314453125, "eval_logps/rejected": -706.3511962890625, "eval_loss": 0.48887208104133606, "eval_rewards/accuracies": 0.7509999871253967, "eval_rewards/chosen": -3.303520679473877, "eval_rewards/margins": 1.3142021894454956, "eval_rewards/rejected": -4.617722988128662, "eval_runtime": 1591.0726, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 7400 }, { "epoch": 0.97, "grad_norm": 31.125, "learning_rate": 1.402752729857959e-08, "logits/chosen": 0.4313858151435852, "logits/rejected": 1.6850160360336304, "logps/chosen": -596.0479125976562, "logps/rejected": -672.6542358398438, "loss": 0.5335, "rewards/accuracies": 0.6875, "rewards/chosen": -3.3667705059051514, "rewards/margins": 1.3584734201431274, "rewards/rejected": -4.725244045257568, "step": 7410 }, { "epoch": 0.97, "grad_norm": 17.25, "learning_rate": 1.2845333885992683e-08, "logits/chosen": 0.5016809701919556, "logits/rejected": 1.1508241891860962, "logps/chosen": -554.3850708007812, "logps/rejected": -715.5885009765625, "loss": 0.4485, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.1848483085632324, "rewards/margins": 1.6087548732757568, "rewards/rejected": -4.79360294342041, "step": 7420 }, { "epoch": 0.97, "grad_norm": 10.0, "learning_rate": 1.171504482075675e-08, "logits/chosen": 0.2105075567960739, "logits/rejected": 1.2978436946868896, "logps/chosen": -595.2945556640625, "logps/rejected": -743.7322998046875, "loss": 0.3998, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3599941730499268, "rewards/margins": 1.502739667892456, "rewards/rejected": -4.862734317779541, "step": 7430 }, { "epoch": 0.97, "grad_norm": 20.625, "learning_rate": 1.0636683690836147e-08, "logits/chosen": 0.3214600086212158, "logits/rejected": 1.136541485786438, "logps/chosen": -607.3497314453125, "logps/rejected": -740.9457397460938, "loss": 0.4688, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.1903975009918213, "rewards/margins": 1.4278573989868164, "rewards/rejected": -4.618254661560059, "step": 7440 }, { "epoch": 0.97, "grad_norm": 21.625, "learning_rate": 9.610273000513203e-09, "logits/chosen": 0.7849928140640259, "logits/rejected": 1.3118616342544556, "logps/chosen": -529.127685546875, "logps/rejected": -659.9454345703125, "loss": 0.4025, "rewards/accuracies": 0.875, "rewards/chosen": -3.1130189895629883, "rewards/margins": 1.4227343797683716, "rewards/rejected": -4.5357537269592285, "step": 7450 }, { "epoch": 0.98, "grad_norm": 12.75, "learning_rate": 8.635834169918312e-09, "logits/chosen": -0.15050581097602844, "logits/rejected": 0.86566561460495, "logps/chosen": -588.2030029296875, "logps/rejected": -701.0858764648438, "loss": 0.3991, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.156752109527588, "rewards/margins": 1.3994879722595215, "rewards/rejected": -4.556240081787109, "step": 7460 }, { "epoch": 0.98, "grad_norm": 14.0, "learning_rate": 7.713387534582506e-09, "logits/chosen": 0.42161521315574646, "logits/rejected": 0.6920258402824402, "logps/chosen": -543.2078857421875, "logps/rejected": -679.2174072265625, "loss": 0.566, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.1884536743164062, "rewards/margins": 1.1040431261062622, "rewards/rejected": -4.292496681213379, "step": 7470 }, { "epoch": 0.98, "grad_norm": 16.25, "learning_rate": 6.84295234501392e-09, "logits/chosen": 0.3836548924446106, "logits/rejected": 0.7305153012275696, "logps/chosen": -597.7833251953125, "logps/rejected": -677.3738403320312, "loss": 0.5487, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.133619785308838, "rewards/margins": 1.116659164428711, "rewards/rejected": -4.250278949737549, "step": 7480 }, { "epoch": 0.98, "grad_norm": 23.0, "learning_rate": 6.024546766295325e-09, "logits/chosen": 0.15691399574279785, "logits/rejected": 1.376615047454834, "logps/chosen": -587.3973999023438, "logps/rejected": -657.0569458007812, "loss": 0.5322, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.09767746925354, "rewards/margins": 1.212154507637024, "rewards/rejected": -4.309831619262695, "step": 7490 }, { "epoch": 0.98, "grad_norm": 9.3125, "learning_rate": 5.2581878777049895e-09, "logits/chosen": 0.4897291660308838, "logits/rejected": 1.0999479293823242, "logps/chosen": -582.9024658203125, "logps/rejected": -714.1553344726562, "loss": 0.3856, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.102313995361328, "rewards/margins": 1.7228889465332031, "rewards/rejected": -4.825202941894531, "step": 7500 }, { "epoch": 0.98, "eval_logits/chosen": 1.011379361152649, "eval_logits/rejected": 1.7544424533843994, "eval_logps/chosen": -595.0473022460938, "eval_logps/rejected": -706.4486083984375, "eval_loss": 0.48872044682502747, "eval_rewards/accuracies": 0.7515000104904175, "eval_rewards/chosen": -3.30426287651062, "eval_rewards/margins": 1.3144340515136719, "eval_rewards/rejected": -4.618696689605713, "eval_runtime": 1590.4964, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 7500 }, { "epoch": 0.98, "grad_norm": 13.3125, "learning_rate": 4.543891672361411e-09, "logits/chosen": -0.033980078995227814, "logits/rejected": 1.000163197517395, "logps/chosen": -629.8569946289062, "logps/rejected": -730.5193481445312, "loss": 0.4221, "rewards/accuracies": 0.75, "rewards/chosen": -3.1672351360321045, "rewards/margins": 1.4683955907821655, "rewards/rejected": -4.635631084442139, "step": 7510 }, { "epoch": 0.98, "grad_norm": 10.25, "learning_rate": 3.881673056887747e-09, "logits/chosen": 0.13274362683296204, "logits/rejected": 1.4995901584625244, "logps/chosen": -599.1785888671875, "logps/rejected": -693.8605346679688, "loss": 0.428, "rewards/accuracies": 0.75, "rewards/chosen": -3.077873945236206, "rewards/margins": 1.6000537872314453, "rewards/rejected": -4.677927494049072, "step": 7520 }, { "epoch": 0.99, "grad_norm": 23.5, "learning_rate": 3.2715458511023425e-09, "logits/chosen": 0.3830450475215912, "logits/rejected": 0.914400577545166, "logps/chosen": -604.7692260742188, "logps/rejected": -697.35205078125, "loss": 0.587, "rewards/accuracies": 0.6875, "rewards/chosen": -3.5337085723876953, "rewards/margins": 0.94794762134552, "rewards/rejected": -4.481656074523926, "step": 7530 }, { "epoch": 0.99, "grad_norm": 13.6875, "learning_rate": 2.7135227877289617e-09, "logits/chosen": 0.5699089765548706, "logits/rejected": 0.8099034428596497, "logps/chosen": -581.1547241210938, "logps/rejected": -740.4284057617188, "loss": 0.45, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.312006711959839, "rewards/margins": 1.4148095846176147, "rewards/rejected": -4.726816654205322, "step": 7540 }, { "epoch": 0.99, "grad_norm": 21.875, "learning_rate": 2.2076155121328326e-09, "logits/chosen": 0.5921443700790405, "logits/rejected": 1.261713981628418, "logps/chosen": -579.9949951171875, "logps/rejected": -733.7827758789062, "loss": 0.3957, "rewards/accuracies": 0.8125, "rewards/chosen": -3.289013385772705, "rewards/margins": 1.5766642093658447, "rewards/rejected": -4.865677833557129, "step": 7550 }, { "epoch": 0.99, "grad_norm": 16.125, "learning_rate": 1.7538345820755641e-09, "logits/chosen": 0.4986654222011566, "logits/rejected": 1.1568987369537354, "logps/chosen": -579.21533203125, "logps/rejected": -667.2409057617188, "loss": 0.5121, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.2680678367614746, "rewards/margins": 1.1933481693267822, "rewards/rejected": -4.461415767669678, "step": 7560 }, { "epoch": 0.99, "grad_norm": 14.5625, "learning_rate": 1.3521894674961567e-09, "logits/chosen": 0.21069379150867462, "logits/rejected": 1.3896992206573486, "logps/chosen": -589.5611572265625, "logps/rejected": -748.1316528320312, "loss": 0.4201, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2263946533203125, "rewards/margins": 1.7248976230621338, "rewards/rejected": -4.951291561126709, "step": 7570 }, { "epoch": 0.99, "grad_norm": 14.75, "learning_rate": 1.0026885503131023e-09, "logits/chosen": 0.10079088062047958, "logits/rejected": 1.2259687185287476, "logps/chosen": -587.1923828125, "logps/rejected": -704.1217041015625, "loss": 0.4449, "rewards/accuracies": 0.8125, "rewards/chosen": -3.298996686935425, "rewards/margins": 1.4512828588485718, "rewards/rejected": -4.750279426574707, "step": 7580 }, { "epoch": 0.99, "grad_norm": 16.375, "learning_rate": 7.053391242492491e-10, "logits/chosen": 0.2513376772403717, "logits/rejected": 0.9676704406738281, "logps/chosen": -625.3098754882812, "logps/rejected": -719.0193481445312, "loss": 0.5372, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.4935874938964844, "rewards/margins": 1.0755943059921265, "rewards/rejected": -4.5691819190979, "step": 7590 }, { "epoch": 0.99, "grad_norm": 8.4375, "learning_rate": 4.6014739467997725e-10, "logits/chosen": 0.40907493233680725, "logits/rejected": 0.9747940301895142, "logps/chosen": -537.2282104492188, "logps/rejected": -644.7360229492188, "loss": 0.5369, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.1004512310028076, "rewards/margins": 1.0329300165176392, "rewards/rejected": -4.133380889892578, "step": 7600 }, { "epoch": 0.99, "eval_logits/chosen": 1.0126349925994873, "eval_logits/rejected": 1.755886197090149, "eval_logps/chosen": -594.9012451171875, "eval_logps/rejected": -706.3289794921875, "eval_loss": 0.4886167645454407, "eval_rewards/accuracies": 0.7519999742507935, "eval_rewards/chosen": -3.302802324295044, "eval_rewards/margins": 1.3146986961364746, "eval_rewards/rejected": -4.6175007820129395, "eval_runtime": 1590.8922, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 7600 }, { "epoch": 1.0, "grad_norm": 14.375, "learning_rate": 2.671184785033032e-10, "logits/chosen": 0.04215417057275772, "logits/rejected": 1.0146968364715576, "logps/chosen": -626.7977905273438, "logps/rejected": -724.8660278320312, "loss": 0.4751, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.042105197906494, "rewards/margins": 1.3117393255233765, "rewards/rejected": -4.353843688964844, "step": 7610 }, { "epoch": 1.0, "grad_norm": 16.375, "learning_rate": 1.2625640403302054e-10, "logits/chosen": 0.33571863174438477, "logits/rejected": 0.9231777191162109, "logps/chosen": -619.7828369140625, "logps/rejected": -719.2232666015625, "loss": 0.5526, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.4431312084198, "rewards/margins": 1.2789758443832397, "rewards/rejected": -4.72210693359375, "step": 7620 }, { "epoch": 1.0, "grad_norm": 8.625, "learning_rate": 3.756411091515588e-11, "logits/chosen": 0.18591374158859253, "logits/rejected": 1.280806303024292, "logps/chosen": -558.523681640625, "logps/rejected": -691.3430786132812, "loss": 0.4201, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.0532612800598145, "rewards/margins": 1.6573164463043213, "rewards/rejected": -4.710577487945557, "step": 7630 }, { "epoch": 1.0, "grad_norm": 7.5625, "learning_rate": 1.0434500657963143e-12, "logits/chosen": 0.10529494285583496, "logits/rejected": 1.4016101360321045, "logps/chosen": -626.654296875, "logps/rejected": -703.9794921875, "loss": 0.5513, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.382859706878662, "rewards/margins": 1.3783118724822998, "rewards/rejected": -4.761171817779541, "step": 7640 }, { "epoch": 1.0, "step": 7642, "total_flos": 0.0, "train_loss": 0.5201432038994555, "train_runtime": 240641.1375, "train_samples_per_second": 0.254, "train_steps_per_second": 0.032 } ], "logging_steps": 10, "max_steps": 7642, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }