{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5095, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019627085377821394, "grad_norm": 55.414124236430546, "learning_rate": 9.803921568627451e-10, "logits/chosen": -2.9195547103881836, "logits/rejected": -2.4565553665161133, "logps/chosen": -421.782470703125, "logps/rejected": -89.33955383300781, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.001962708537782139, "grad_norm": 52.15274864950026, "learning_rate": 9.803921568627451e-09, "logits/chosen": -2.5578269958496094, "logits/rejected": -2.5533926486968994, "logps/chosen": -328.4997863769531, "logps/rejected": -224.72509765625, "loss": 1.0, "rewards/accuracies": 0.3703703284263611, "rewards/chosen": 0.011046407744288445, "rewards/margins": -0.09586013108491898, "rewards/rejected": 0.10690654069185257, "step": 10 }, { "epoch": 0.003925417075564278, "grad_norm": 56.996684673463676, "learning_rate": 1.9607843137254902e-08, "logits/chosen": -2.748328447341919, "logits/rejected": -2.6488404273986816, "logps/chosen": -241.4247589111328, "logps/rejected": -228.7529296875, "loss": 0.9999, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 0.15143682062625885, "rewards/margins": 0.06219663470983505, "rewards/rejected": 0.089240163564682, "step": 20 }, { "epoch": 0.005888125613346418, "grad_norm": 45.312543510435866, "learning_rate": 2.941176470588235e-08, "logits/chosen": -2.805752754211426, "logits/rejected": -2.749796152114868, "logps/chosen": -271.8738098144531, "logps/rejected": -277.1117858886719, "loss": 1.0004, "rewards/accuracies": 0.36666664481163025, "rewards/chosen": -0.13984277844429016, "rewards/margins": -0.2147332727909088, "rewards/rejected": 0.07489053159952164, "step": 30 }, { "epoch": 0.007850834151128557, "grad_norm": 57.481567251372226, "learning_rate": 3.9215686274509804e-08, "logits/chosen": -2.52890944480896, "logits/rejected": -2.6134731769561768, "logps/chosen": -235.9799346923828, "logps/rejected": -199.0916748046875, "loss": 0.9998, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 0.1794951856136322, "rewards/margins": 0.17383304238319397, "rewards/rejected": 0.00566215580329299, "step": 40 }, { "epoch": 0.009813542688910697, "grad_norm": 48.97067866151835, "learning_rate": 4.901960784313725e-08, "logits/chosen": -2.765376091003418, "logits/rejected": -2.730837821960449, "logps/chosen": -265.35858154296875, "logps/rejected": -283.53167724609375, "loss": 0.9999, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 0.06280431896448135, "rewards/margins": -0.005956542678177357, "rewards/rejected": 0.06876085698604584, "step": 50 }, { "epoch": 0.011776251226692836, "grad_norm": 46.64095654430081, "learning_rate": 5.88235294117647e-08, "logits/chosen": -2.755647659301758, "logits/rejected": -2.671654224395752, "logps/chosen": -254.79672241210938, "logps/rejected": -236.0987548828125, "loss": 0.9994, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 0.09678258746862411, "rewards/margins": 0.06553478538990021, "rewards/rejected": 0.031247805804014206, "step": 60 }, { "epoch": 0.013738959764474975, "grad_norm": 48.5689303983752, "learning_rate": 6.862745098039216e-08, "logits/chosen": -2.8365824222564697, "logits/rejected": -2.7729034423828125, "logps/chosen": -301.89404296875, "logps/rejected": -237.1268768310547, "loss": 0.9994, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 0.21221618354320526, "rewards/margins": 0.05830637365579605, "rewards/rejected": 0.1539098173379898, "step": 70 }, { "epoch": 0.015701668302257114, "grad_norm": 56.96202081528457, "learning_rate": 7.843137254901961e-08, "logits/chosen": -2.758204936981201, "logits/rejected": -2.5587191581726074, "logps/chosen": -324.0185241699219, "logps/rejected": -213.40060424804688, "loss": 0.9988, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.25671494007110596, "rewards/margins": 0.2088421881198883, "rewards/rejected": 0.04787272959947586, "step": 80 }, { "epoch": 0.017664376840039256, "grad_norm": 56.56285398890676, "learning_rate": 8.823529411764706e-08, "logits/chosen": -2.821187973022461, "logits/rejected": -2.797405242919922, "logps/chosen": -269.6439208984375, "logps/rejected": -272.57781982421875, "loss": 0.998, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 0.5080066919326782, "rewards/margins": 0.18387934565544128, "rewards/rejected": 0.32412731647491455, "step": 90 }, { "epoch": 0.019627085377821395, "grad_norm": 47.150366516448564, "learning_rate": 9.80392156862745e-08, "logits/chosen": -2.792006492614746, "logits/rejected": -2.697812557220459, "logps/chosen": -295.50555419921875, "logps/rejected": -259.6533508300781, "loss": 0.9973, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 0.5169113278388977, "rewards/margins": 0.25743037462234497, "rewards/rejected": 0.25948095321655273, "step": 100 }, { "epoch": 0.021589793915603533, "grad_norm": 55.468925381397625, "learning_rate": 1.0784313725490195e-07, "logits/chosen": -2.7970244884490967, "logits/rejected": -2.7083277702331543, "logps/chosen": -331.6293029785156, "logps/rejected": -292.5093078613281, "loss": 0.9967, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 1.1082704067230225, "rewards/margins": 0.6746741533279419, "rewards/rejected": 0.4335961937904358, "step": 110 }, { "epoch": 0.023552502453385672, "grad_norm": 42.612365590574086, "learning_rate": 1.176470588235294e-07, "logits/chosen": -2.647759437561035, "logits/rejected": -2.551598072052002, "logps/chosen": -202.25650024414062, "logps/rejected": -188.682861328125, "loss": 0.9941, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 0.6571747064590454, "rewards/margins": 0.9068604707717896, "rewards/rejected": -0.24968579411506653, "step": 120 }, { "epoch": 0.02551521099116781, "grad_norm": 50.3982402598436, "learning_rate": 1.2745098039215685e-07, "logits/chosen": -2.5919909477233887, "logits/rejected": -2.6269984245300293, "logps/chosen": -359.5844421386719, "logps/rejected": -300.26177978515625, "loss": 0.992, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 1.5606809854507446, "rewards/margins": 0.9813000559806824, "rewards/rejected": 0.5793809294700623, "step": 130 }, { "epoch": 0.02747791952894995, "grad_norm": 69.50110312136805, "learning_rate": 1.3725490196078432e-07, "logits/chosen": -2.686027765274048, "logits/rejected": -2.7174320220947266, "logps/chosen": -192.32054138183594, "logps/rejected": -194.1149139404297, "loss": 0.988, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8719283938407898, "rewards/margins": 1.41945219039917, "rewards/rejected": -0.5475236773490906, "step": 140 }, { "epoch": 0.029440628066732092, "grad_norm": 54.63373156683644, "learning_rate": 1.4705882352941175e-07, "logits/chosen": -2.841104507446289, "logits/rejected": -2.734062671661377, "logps/chosen": -233.43362426757812, "logps/rejected": -225.4843292236328, "loss": 0.9875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7091665267944336, "rewards/margins": 0.8218281865119934, "rewards/rejected": -0.11266165971755981, "step": 150 }, { "epoch": 0.03140333660451423, "grad_norm": 64.47137392637033, "learning_rate": 1.5686274509803921e-07, "logits/chosen": -2.7783737182617188, "logits/rejected": -2.680637836456299, "logps/chosen": -277.5608825683594, "logps/rejected": -219.86453247070312, "loss": 0.9718, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.978581428527832, "rewards/margins": 2.63038969039917, "rewards/rejected": -1.6518083810806274, "step": 160 }, { "epoch": 0.033366045142296366, "grad_norm": 55.9302712973517, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -2.669224262237549, "logits/rejected": -2.644956111907959, "logps/chosen": -248.44046020507812, "logps/rejected": -208.7925262451172, "loss": 0.9744, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.34655627608299255, "rewards/margins": 1.9494075775146484, "rewards/rejected": -2.295963764190674, "step": 170 }, { "epoch": 0.03532875368007851, "grad_norm": 38.93661910641483, "learning_rate": 1.764705882352941e-07, "logits/chosen": -2.752145290374756, "logits/rejected": -2.648857831954956, "logps/chosen": -259.3789978027344, "logps/rejected": -250.989990234375, "loss": 0.9782, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -0.5382688641548157, "rewards/margins": 3.4019508361816406, "rewards/rejected": -3.9402194023132324, "step": 180 }, { "epoch": 0.03729146221786065, "grad_norm": 63.778016387242715, "learning_rate": 1.8627450980392158e-07, "logits/chosen": -2.649264097213745, "logits/rejected": -2.5917062759399414, "logps/chosen": -317.41717529296875, "logps/rejected": -259.6824645996094, "loss": 0.9769, "rewards/accuracies": 0.6333333849906921, "rewards/chosen": 0.7618657946586609, "rewards/margins": 3.695021152496338, "rewards/rejected": -2.9331555366516113, "step": 190 }, { "epoch": 0.03925417075564279, "grad_norm": 71.16321823510685, "learning_rate": 1.96078431372549e-07, "logits/chosen": -2.7696406841278076, "logits/rejected": -2.5761637687683105, "logps/chosen": -283.6011657714844, "logps/rejected": -199.72254943847656, "loss": 0.9608, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2294278144836426, "rewards/margins": 6.6622314453125, "rewards/rejected": -3.4328041076660156, "step": 200 }, { "epoch": 0.04121687929342493, "grad_norm": 54.324281743069825, "learning_rate": 2.0588235294117645e-07, "logits/chosen": -2.5626652240753174, "logits/rejected": -2.4831199645996094, "logps/chosen": -242.19656372070312, "logps/rejected": -239.0122833251953, "loss": 0.9402, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1904563456773758, "rewards/margins": 3.7318828105926514, "rewards/rejected": -3.541426420211792, "step": 210 }, { "epoch": 0.04317958783120707, "grad_norm": 54.049498509984915, "learning_rate": 2.156862745098039e-07, "logits/chosen": -2.735288143157959, "logits/rejected": -2.6886754035949707, "logps/chosen": -288.646240234375, "logps/rejected": -291.86566162109375, "loss": 0.9463, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -0.446907103061676, "rewards/margins": 5.959353446960449, "rewards/rejected": -6.4062604904174805, "step": 220 }, { "epoch": 0.045142296368989206, "grad_norm": 50.40537380547292, "learning_rate": 2.2549019607843137e-07, "logits/chosen": -2.767138957977295, "logits/rejected": -2.6347174644470215, "logps/chosen": -268.5491638183594, "logps/rejected": -210.41104125976562, "loss": 0.9406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6417080163955688, "rewards/margins": 6.2437591552734375, "rewards/rejected": -7.885466575622559, "step": 230 }, { "epoch": 0.047105004906771344, "grad_norm": 58.56373729015666, "learning_rate": 2.352941176470588e-07, "logits/chosen": -2.6756463050842285, "logits/rejected": -2.656825304031372, "logps/chosen": -250.88662719726562, "logps/rejected": -262.0466003417969, "loss": 0.9347, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -0.9389527440071106, "rewards/margins": 13.691622734069824, "rewards/rejected": -14.630575180053711, "step": 240 }, { "epoch": 0.04906771344455348, "grad_norm": 62.91248653422729, "learning_rate": 2.4509803921568627e-07, "logits/chosen": -2.6354706287384033, "logits/rejected": -2.5464749336242676, "logps/chosen": -244.25607299804688, "logps/rejected": -214.0111846923828, "loss": 0.9249, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": -4.828089237213135, "rewards/margins": 6.163465976715088, "rewards/rejected": -10.991555213928223, "step": 250 }, { "epoch": 0.05103042198233562, "grad_norm": 58.90444503921364, "learning_rate": 2.549019607843137e-07, "logits/chosen": -2.7298409938812256, "logits/rejected": -2.705272912979126, "logps/chosen": -317.15704345703125, "logps/rejected": -258.4947204589844, "loss": 0.9244, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 0.5937615633010864, "rewards/margins": 9.848703384399414, "rewards/rejected": -9.254941940307617, "step": 260 }, { "epoch": 0.05299313052011776, "grad_norm": 70.29034254567453, "learning_rate": 2.6470588235294114e-07, "logits/chosen": -2.6792712211608887, "logits/rejected": -2.6439366340637207, "logps/chosen": -224.71426391601562, "logps/rejected": -226.5146484375, "loss": 0.9618, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": 2.8806838989257812, "rewards/margins": 6.444188594818115, "rewards/rejected": -3.563504695892334, "step": 270 }, { "epoch": 0.0549558390578999, "grad_norm": 73.8671570512029, "learning_rate": 2.7450980392156863e-07, "logits/chosen": -2.708838939666748, "logits/rejected": -2.622032403945923, "logps/chosen": -244.63595581054688, "logps/rejected": -214.27084350585938, "loss": 0.9036, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 0.792637288570404, "rewards/margins": 13.196484565734863, "rewards/rejected": -12.403847694396973, "step": 280 }, { "epoch": 0.05691854759568204, "grad_norm": 49.61807698046266, "learning_rate": 2.8431372549019607e-07, "logits/chosen": -2.826683521270752, "logits/rejected": -2.6963696479797363, "logps/chosen": -298.2339782714844, "logps/rejected": -244.4069061279297, "loss": 0.8822, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 6.4862799644470215, "rewards/margins": 19.688907623291016, "rewards/rejected": -13.202627182006836, "step": 290 }, { "epoch": 0.058881256133464184, "grad_norm": 68.04359512174663, "learning_rate": 2.941176470588235e-07, "logits/chosen": -2.701723575592041, "logits/rejected": -2.6761786937713623, "logps/chosen": -285.5689697265625, "logps/rejected": -313.41937255859375, "loss": 0.8987, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -2.276944398880005, "rewards/margins": 10.958003997802734, "rewards/rejected": -13.234947204589844, "step": 300 }, { "epoch": 0.06084396467124632, "grad_norm": 87.67158381820029, "learning_rate": 3.0392156862745094e-07, "logits/chosen": -2.708446979522705, "logits/rejected": -2.608660936355591, "logps/chosen": -301.845458984375, "logps/rejected": -249.30776977539062, "loss": 0.8796, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.20882701873779297, "rewards/margins": 13.04846477508545, "rewards/rejected": -12.839637756347656, "step": 310 }, { "epoch": 0.06280667320902845, "grad_norm": 76.16272662075276, "learning_rate": 3.1372549019607843e-07, "logits/chosen": -2.6973652839660645, "logits/rejected": -2.7176012992858887, "logps/chosen": -209.5578155517578, "logps/rejected": -233.25808715820312, "loss": 0.9276, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -5.8708648681640625, "rewards/margins": 2.8086514472961426, "rewards/rejected": -8.67951488494873, "step": 320 }, { "epoch": 0.0647693817468106, "grad_norm": 64.08866978476804, "learning_rate": 3.2352941176470586e-07, "logits/chosen": -2.7702319622039795, "logits/rejected": -2.719383478164673, "logps/chosen": -275.3776550292969, "logps/rejected": -239.3572998046875, "loss": 0.8705, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 0.6563128232955933, "rewards/margins": 26.388599395751953, "rewards/rejected": -25.732284545898438, "step": 330 }, { "epoch": 0.06673209028459273, "grad_norm": 91.86283169679935, "learning_rate": 3.333333333333333e-07, "logits/chosen": -2.8357901573181152, "logits/rejected": -2.670436143875122, "logps/chosen": -355.12750244140625, "logps/rejected": -273.102294921875, "loss": 0.864, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -4.8998565673828125, "rewards/margins": 27.38639259338379, "rewards/rejected": -32.28624725341797, "step": 340 }, { "epoch": 0.06869479882237488, "grad_norm": 75.94419876858274, "learning_rate": 3.431372549019608e-07, "logits/chosen": -2.771730422973633, "logits/rejected": -2.6880266666412354, "logps/chosen": -203.7973175048828, "logps/rejected": -193.547119140625, "loss": 0.9251, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -13.175801277160645, "rewards/margins": 3.0631513595581055, "rewards/rejected": -16.238950729370117, "step": 350 }, { "epoch": 0.07065750736015702, "grad_norm": 59.976909034674314, "learning_rate": 3.529411764705882e-07, "logits/chosen": -2.838757276535034, "logits/rejected": -2.621591091156006, "logps/chosen": -355.5475158691406, "logps/rejected": -291.84466552734375, "loss": 0.8435, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -5.186367988586426, "rewards/margins": 30.573680877685547, "rewards/rejected": -35.76005554199219, "step": 360 }, { "epoch": 0.07262021589793916, "grad_norm": 70.05872652589493, "learning_rate": 3.6274509803921566e-07, "logits/chosen": -2.8128840923309326, "logits/rejected": -2.7115535736083984, "logps/chosen": -271.34991455078125, "logps/rejected": -277.22418212890625, "loss": 0.9032, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 2.315098285675049, "rewards/margins": 16.740659713745117, "rewards/rejected": -14.425562858581543, "step": 370 }, { "epoch": 0.0745829244357213, "grad_norm": 66.55773679852149, "learning_rate": 3.7254901960784315e-07, "logits/chosen": -2.67268443107605, "logits/rejected": -2.795374631881714, "logps/chosen": -233.9839324951172, "logps/rejected": -302.9767761230469, "loss": 0.8688, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 6.1063432693481445, "rewards/margins": 16.245288848876953, "rewards/rejected": -10.138948440551758, "step": 380 }, { "epoch": 0.07654563297350343, "grad_norm": 56.55031235778978, "learning_rate": 3.8235294117647053e-07, "logits/chosen": -2.6723198890686035, "logits/rejected": -2.462125062942505, "logps/chosen": -264.40460205078125, "logps/rejected": -293.06121826171875, "loss": 0.8722, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.301637649536133, "rewards/margins": 16.670955657958984, "rewards/rejected": -14.369318008422852, "step": 390 }, { "epoch": 0.07850834151128558, "grad_norm": 66.62308677399767, "learning_rate": 3.92156862745098e-07, "logits/chosen": -2.780653715133667, "logits/rejected": -2.652039051055908, "logps/chosen": -263.01873779296875, "logps/rejected": -280.86932373046875, "loss": 0.9093, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 1.3608465194702148, "rewards/margins": 6.478080749511719, "rewards/rejected": -5.117233753204346, "step": 400 }, { "epoch": 0.08047105004906771, "grad_norm": 63.78400192013066, "learning_rate": 4.019607843137255e-07, "logits/chosen": -2.6911838054656982, "logits/rejected": -2.66676926612854, "logps/chosen": -314.56109619140625, "logps/rejected": -295.1566162109375, "loss": 0.8503, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -13.199732780456543, "rewards/margins": 28.378488540649414, "rewards/rejected": -41.578216552734375, "step": 410 }, { "epoch": 0.08243375858684986, "grad_norm": 94.99850349627413, "learning_rate": 4.117647058823529e-07, "logits/chosen": -2.692110776901245, "logits/rejected": -2.672994375228882, "logps/chosen": -280.2340393066406, "logps/rejected": -322.0666198730469, "loss": 0.854, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.604442596435547, "rewards/margins": 26.25628662109375, "rewards/rejected": -45.8607292175293, "step": 420 }, { "epoch": 0.08439646712463199, "grad_norm": 118.67917436651753, "learning_rate": 4.215686274509804e-07, "logits/chosen": -2.8876233100891113, "logits/rejected": -2.6736977100372314, "logps/chosen": -369.3075866699219, "logps/rejected": -268.67333984375, "loss": 0.8487, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -5.0758161544799805, "rewards/margins": 23.310321807861328, "rewards/rejected": -28.386138916015625, "step": 430 }, { "epoch": 0.08635917566241413, "grad_norm": 50.695588379498645, "learning_rate": 4.313725490196078e-07, "logits/chosen": -2.6468257904052734, "logits/rejected": -2.5626094341278076, "logps/chosen": -292.5172119140625, "logps/rejected": -257.24615478515625, "loss": 0.8584, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 6.937422752380371, "rewards/margins": 21.171459197998047, "rewards/rejected": -14.234036445617676, "step": 440 }, { "epoch": 0.08832188420019627, "grad_norm": 73.21371867968895, "learning_rate": 4.4117647058823526e-07, "logits/chosen": -2.8199658393859863, "logits/rejected": -2.8850109577178955, "logps/chosen": -281.73748779296875, "logps/rejected": -307.69390869140625, "loss": 0.8221, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.8003543615341187, "rewards/margins": 17.500152587890625, "rewards/rejected": -18.300504684448242, "step": 450 }, { "epoch": 0.09028459273797841, "grad_norm": 70.96073253983495, "learning_rate": 4.5098039215686274e-07, "logits/chosen": -2.540916681289673, "logits/rejected": -2.4045634269714355, "logps/chosen": -253.9958953857422, "logps/rejected": -290.2467041015625, "loss": 0.86, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -4.659354209899902, "rewards/margins": 27.491165161132812, "rewards/rejected": -32.150516510009766, "step": 460 }, { "epoch": 0.09224730127576054, "grad_norm": 62.670822295050996, "learning_rate": 4.6078431372549013e-07, "logits/chosen": -2.507478713989258, "logits/rejected": -2.3888206481933594, "logps/chosen": -266.78375244140625, "logps/rejected": -362.281005859375, "loss": 0.783, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -3.1778388023376465, "rewards/margins": 22.878177642822266, "rewards/rejected": -26.056011199951172, "step": 470 }, { "epoch": 0.09421000981354269, "grad_norm": 95.48242405873745, "learning_rate": 4.705882352941176e-07, "logits/chosen": -2.7770683765411377, "logits/rejected": -2.6778721809387207, "logps/chosen": -328.26507568359375, "logps/rejected": -365.73150634765625, "loss": 0.8265, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -8.122284889221191, "rewards/margins": 34.76969528198242, "rewards/rejected": -42.89198684692383, "step": 480 }, { "epoch": 0.09617271835132483, "grad_norm": 49.93423425130017, "learning_rate": 4.803921568627451e-07, "logits/chosen": -2.544484853744507, "logits/rejected": -2.4896888732910156, "logps/chosen": -298.1867980957031, "logps/rejected": -346.29254150390625, "loss": 0.8255, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.5153369903564453, "rewards/margins": 20.870067596435547, "rewards/rejected": -24.385404586791992, "step": 490 }, { "epoch": 0.09813542688910697, "grad_norm": 95.66170305864956, "learning_rate": 4.901960784313725e-07, "logits/chosen": -2.7141783237457275, "logits/rejected": -2.678609848022461, "logps/chosen": -306.56939697265625, "logps/rejected": -275.1187438964844, "loss": 0.842, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -6.970645904541016, "rewards/margins": 17.500308990478516, "rewards/rejected": -24.470956802368164, "step": 500 }, { "epoch": 0.10009813542688911, "grad_norm": 69.86526686352707, "learning_rate": 5e-07, "logits/chosen": -2.598240613937378, "logits/rejected": -2.489713668823242, "logps/chosen": -298.87445068359375, "logps/rejected": -297.8564758300781, "loss": 0.8945, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -10.812700271606445, "rewards/margins": 14.292556762695312, "rewards/rejected": -25.10525894165039, "step": 510 }, { "epoch": 0.10206084396467124, "grad_norm": 57.044696962402206, "learning_rate": 4.999941314693213e-07, "logits/chosen": -2.67667555809021, "logits/rejected": -2.556001663208008, "logps/chosen": -246.2398223876953, "logps/rejected": -225.1736297607422, "loss": 0.8345, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4034953117370605, "rewards/margins": 21.793445587158203, "rewards/rejected": -18.389949798583984, "step": 520 }, { "epoch": 0.10402355250245339, "grad_norm": 71.13779140496518, "learning_rate": 4.999765261528027e-07, "logits/chosen": -2.7669053077697754, "logits/rejected": -2.7825069427490234, "logps/chosen": -289.7940368652344, "logps/rejected": -345.27398681640625, "loss": 0.914, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -0.43848705291748047, "rewards/margins": 28.61753273010254, "rewards/rejected": -29.056018829345703, "step": 530 }, { "epoch": 0.10598626104023552, "grad_norm": 73.08189206277179, "learning_rate": 4.999471848769828e-07, "logits/chosen": -2.593777656555176, "logits/rejected": -2.612285852432251, "logps/chosen": -298.7412414550781, "logps/rejected": -347.7586975097656, "loss": 0.8021, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -8.63609504699707, "rewards/margins": 25.522174835205078, "rewards/rejected": -34.15827178955078, "step": 540 }, { "epoch": 0.10794896957801767, "grad_norm": 66.92080918944201, "learning_rate": 4.999061090193831e-07, "logits/chosen": -2.800966501235962, "logits/rejected": -2.6820874214172363, "logps/chosen": -330.7584228515625, "logps/rejected": -331.39276123046875, "loss": 0.8421, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -7.950586795806885, "rewards/margins": 21.71441650390625, "rewards/rejected": -29.66500473022461, "step": 550 }, { "epoch": 0.1099116781157998, "grad_norm": 75.9075194202252, "learning_rate": 4.998533005084428e-07, "logits/chosen": -2.7314610481262207, "logits/rejected": -2.649965286254883, "logps/chosen": -278.94903564453125, "logps/rejected": -295.6121520996094, "loss": 0.8909, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 2.5758838653564453, "rewards/margins": 46.404876708984375, "rewards/rejected": -43.82899475097656, "step": 560 }, { "epoch": 0.11187438665358194, "grad_norm": 127.02048394376511, "learning_rate": 4.997887618234292e-07, "logits/chosen": -2.4379990100860596, "logits/rejected": -2.5773298740386963, "logps/chosen": -282.4049072265625, "logps/rejected": -339.1448974609375, "loss": 0.7104, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -0.7564314603805542, "rewards/margins": 29.629119873046875, "rewards/rejected": -30.385547637939453, "step": 570 }, { "epoch": 0.11383709519136408, "grad_norm": 71.92499556070872, "learning_rate": 4.997124959943201e-07, "logits/chosen": -2.607017993927002, "logits/rejected": -2.2096800804138184, "logps/chosen": -261.0818786621094, "logps/rejected": -246.7998809814453, "loss": 1.0073, "rewards/accuracies": 0.6333332657814026, "rewards/chosen": -17.173709869384766, "rewards/margins": 16.358301162719727, "rewards/rejected": -33.532012939453125, "step": 580 }, { "epoch": 0.11579980372914622, "grad_norm": 97.78550302044361, "learning_rate": 4.996245066016623e-07, "logits/chosen": -2.5673599243164062, "logits/rejected": -2.383335590362549, "logps/chosen": -259.57879638671875, "logps/rejected": -282.16583251953125, "loss": 0.7883, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -4.451306343078613, "rewards/margins": 46.018470764160156, "rewards/rejected": -50.46977615356445, "step": 590 }, { "epoch": 0.11776251226692837, "grad_norm": 124.31501090265762, "learning_rate": 4.995247977764035e-07, "logits/chosen": -2.568408727645874, "logits/rejected": -2.4662976264953613, "logps/chosen": -234.73257446289062, "logps/rejected": -261.03741455078125, "loss": 0.8213, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -0.1717933714389801, "rewards/margins": 51.04700469970703, "rewards/rejected": -51.21879959106445, "step": 600 }, { "epoch": 0.1197252208047105, "grad_norm": 96.71524998869944, "learning_rate": 4.994133741996982e-07, "logits/chosen": -2.6599957942962646, "logits/rejected": -2.5923123359680176, "logps/chosen": -295.23419189453125, "logps/rejected": -273.9298095703125, "loss": 0.8219, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -13.418383598327637, "rewards/margins": 21.37362289428711, "rewards/rejected": -34.79200744628906, "step": 610 }, { "epoch": 0.12168792934249265, "grad_norm": 94.07503589808053, "learning_rate": 4.992902411026877e-07, "logits/chosen": -2.5271780490875244, "logits/rejected": -2.487039089202881, "logps/chosen": -280.35870361328125, "logps/rejected": -385.655029296875, "loss": 0.8183, "rewards/accuracies": 0.7333332300186157, "rewards/chosen": -14.516329765319824, "rewards/margins": 28.335102081298828, "rewards/rejected": -42.8514289855957, "step": 620 }, { "epoch": 0.12365063788027478, "grad_norm": 78.00602854614931, "learning_rate": 4.991554042662548e-07, "logits/chosen": -2.429823875427246, "logits/rejected": -2.5177104473114014, "logps/chosen": -258.1920471191406, "logps/rejected": -264.5993347167969, "loss": 0.8502, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -9.239361763000488, "rewards/margins": 14.815203666687012, "rewards/rejected": -24.0545654296875, "step": 630 }, { "epoch": 0.1256133464180569, "grad_norm": 113.52863648657981, "learning_rate": 4.990088700207525e-07, "logits/chosen": -2.3707752227783203, "logits/rejected": -2.244995594024658, "logps/chosen": -212.91635131835938, "logps/rejected": -281.1128234863281, "loss": 0.7466, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -3.0918991565704346, "rewards/margins": 37.77779006958008, "rewards/rejected": -40.869686126708984, "step": 640 }, { "epoch": 0.12757605495583907, "grad_norm": 330.74550722397663, "learning_rate": 4.988506452457066e-07, "logits/chosen": -1.706993818283081, "logits/rejected": -1.4921681880950928, "logps/chosen": -305.51019287109375, "logps/rejected": -381.0821838378906, "loss": 0.8496, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -8.875078201293945, "rewards/margins": 75.9543228149414, "rewards/rejected": -84.82939147949219, "step": 650 }, { "epoch": 0.1295387634936212, "grad_norm": 70.76325514328447, "learning_rate": 4.986807373694925e-07, "logits/chosen": -2.4834747314453125, "logits/rejected": -2.3084394931793213, "logps/chosen": -264.1849365234375, "logps/rejected": -306.91748046875, "loss": 0.8295, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -0.19604988396167755, "rewards/margins": 38.94971466064453, "rewards/rejected": -39.14577102661133, "step": 660 }, { "epoch": 0.13150147203140333, "grad_norm": 147.03979827900244, "learning_rate": 4.984991543689869e-07, "logits/chosen": -2.833151340484619, "logits/rejected": -2.7283763885498047, "logps/chosen": -294.3988342285156, "logps/rejected": -315.2899475097656, "loss": 0.8617, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -11.000986099243164, "rewards/margins": 23.67992401123047, "rewards/rejected": -34.680912017822266, "step": 670 }, { "epoch": 0.13346418056918546, "grad_norm": 97.51952451597994, "learning_rate": 4.983059047691931e-07, "logits/chosen": -2.976386785507202, "logits/rejected": -2.9219889640808105, "logps/chosen": -271.909912109375, "logps/rejected": -245.0648956298828, "loss": 0.8612, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -13.436535835266113, "rewards/margins": 17.41008949279785, "rewards/rejected": -30.84662437438965, "step": 680 }, { "epoch": 0.13542688910696762, "grad_norm": 69.57736781603566, "learning_rate": 4.981009976428408e-07, "logits/chosen": -2.8275818824768066, "logits/rejected": -2.7157340049743652, "logps/chosen": -308.68328857421875, "logps/rejected": -292.2393798828125, "loss": 0.8682, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 2.6185543537139893, "rewards/margins": 28.4803524017334, "rewards/rejected": -25.86179542541504, "step": 690 }, { "epoch": 0.13738959764474976, "grad_norm": 96.0108020393921, "learning_rate": 4.9788444260996e-07, "logits/chosen": -2.824697732925415, "logits/rejected": -2.8100924491882324, "logps/chosen": -262.02093505859375, "logps/rejected": -270.205078125, "loss": 1.2209, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 2.1957712173461914, "rewards/margins": 14.532885551452637, "rewards/rejected": -12.337112426757812, "step": 700 }, { "epoch": 0.1393523061825319, "grad_norm": 195.39938881704745, "learning_rate": 4.976562498374295e-07, "logits/chosen": -2.8171865940093994, "logits/rejected": -2.6745545864105225, "logps/chosen": -294.8547668457031, "logps/rejected": -315.4266357421875, "loss": 0.8751, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -5.106291770935059, "rewards/margins": 51.43401336669922, "rewards/rejected": -56.540313720703125, "step": 710 }, { "epoch": 0.14131501472031405, "grad_norm": 62.34608005696844, "learning_rate": 4.974164300384997e-07, "logits/chosen": -2.694762706756592, "logits/rejected": -2.7580037117004395, "logps/chosen": -237.15847778320312, "logps/rejected": -321.07720947265625, "loss": 0.8405, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5.954977512359619, "rewards/margins": 21.698471069335938, "rewards/rejected": -27.653446197509766, "step": 720 }, { "epoch": 0.14327772325809618, "grad_norm": 56.15211364058788, "learning_rate": 4.971649944722893e-07, "logits/chosen": -2.7886815071105957, "logits/rejected": -2.832386016845703, "logps/chosen": -261.1185302734375, "logps/rejected": -317.8330383300781, "loss": 0.7888, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -8.679344177246094, "rewards/margins": 25.449604034423828, "rewards/rejected": -34.12894821166992, "step": 730 }, { "epoch": 0.1452404317958783, "grad_norm": 85.38026547736922, "learning_rate": 4.96901954943257e-07, "logits/chosen": -2.725878953933716, "logits/rejected": -2.3433279991149902, "logps/chosen": -271.3057556152344, "logps/rejected": -227.71517944335938, "loss": 0.7727, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -5.688392162322998, "rewards/margins": 55.4217643737793, "rewards/rejected": -61.11015701293945, "step": 740 }, { "epoch": 0.14720314033366044, "grad_norm": 81.59878782377339, "learning_rate": 4.96627323800647e-07, "logits/chosen": -2.4705214500427246, "logits/rejected": -2.497694969177246, "logps/chosen": -247.92465209960938, "logps/rejected": -283.5846252441406, "loss": 0.7692, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -10.23571491241455, "rewards/margins": 28.9940242767334, "rewards/rejected": -39.229736328125, "step": 750 }, { "epoch": 0.1491658488714426, "grad_norm": 105.20185517126171, "learning_rate": 4.963411139379099e-07, "logits/chosen": -2.396113395690918, "logits/rejected": -2.2180800437927246, "logps/chosen": -291.8468017578125, "logps/rejected": -286.48321533203125, "loss": 0.787, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2373937368392944, "rewards/margins": 24.7014102935791, "rewards/rejected": -25.938800811767578, "step": 760 }, { "epoch": 0.15112855740922473, "grad_norm": 79.90114580817118, "learning_rate": 4.960433387920964e-07, "logits/chosen": -2.2743828296661377, "logits/rejected": -2.201754331588745, "logps/chosen": -178.5857696533203, "logps/rejected": -318.49444580078125, "loss": 0.8094, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.840551376342773, "rewards/margins": 16.877532958984375, "rewards/rejected": -25.71808433532715, "step": 770 }, { "epoch": 0.15309126594700687, "grad_norm": 90.34028453233641, "learning_rate": 4.957340123432271e-07, "logits/chosen": -2.325589418411255, "logits/rejected": -1.9675579071044922, "logps/chosen": -318.4504699707031, "logps/rejected": -277.85638427734375, "loss": 0.8095, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 14.117498397827148, "rewards/margins": 59.872711181640625, "rewards/rejected": -45.75520706176758, "step": 780 }, { "epoch": 0.155053974484789, "grad_norm": 83.65987229287128, "learning_rate": 4.954131491136361e-07, "logits/chosen": -2.2736544609069824, "logits/rejected": -2.1953623294830322, "logps/chosen": -334.32818603515625, "logps/rejected": -333.3956604003906, "loss": 0.8575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.934081077575684, "rewards/margins": 35.82530975341797, "rewards/rejected": -44.75939178466797, "step": 790 }, { "epoch": 0.15701668302257116, "grad_norm": 139.26646067462295, "learning_rate": 4.95080764167289e-07, "logits/chosen": -2.48186993598938, "logits/rejected": -2.4946374893188477, "logps/chosen": -236.434814453125, "logps/rejected": -331.59442138671875, "loss": 0.7043, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 13.882014274597168, "rewards/margins": 72.13981628417969, "rewards/rejected": -58.25780487060547, "step": 800 }, { "epoch": 0.1589793915603533, "grad_norm": 83.72396091422654, "learning_rate": 4.94736873109076e-07, "logits/chosen": -2.503469467163086, "logits/rejected": -2.3784656524658203, "logps/chosen": -265.27886962890625, "logps/rejected": -296.7031555175781, "loss": 0.7727, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -12.968899726867676, "rewards/margins": 51.905723571777344, "rewards/rejected": -64.87461853027344, "step": 810 }, { "epoch": 0.16094210009813542, "grad_norm": 189.29069739741868, "learning_rate": 4.943814920840787e-07, "logits/chosen": -2.197519302368164, "logits/rejected": -1.9932104349136353, "logps/chosen": -293.50787353515625, "logps/rejected": -302.1598815917969, "loss": 0.8619, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -15.313738822937012, "rewards/margins": 21.322572708129883, "rewards/rejected": -36.63631057739258, "step": 820 }, { "epoch": 0.16290480863591755, "grad_norm": 136.82961355882267, "learning_rate": 4.940146377768126e-07, "logits/chosen": -2.4040262699127197, "logits/rejected": -2.1411640644073486, "logps/chosen": -294.02484130859375, "logps/rejected": -320.5699157714844, "loss": 0.7606, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -6.624289512634277, "rewards/margins": 84.21747589111328, "rewards/rejected": -90.84175872802734, "step": 830 }, { "epoch": 0.1648675171736997, "grad_norm": 100.77888278626888, "learning_rate": 4.936363274104441e-07, "logits/chosen": -2.636396646499634, "logits/rejected": -2.5373311042785645, "logps/chosen": -309.7112731933594, "logps/rejected": -285.56781005859375, "loss": 0.8238, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -23.29251480102539, "rewards/margins": 35.46575164794922, "rewards/rejected": -58.758262634277344, "step": 840 }, { "epoch": 0.16683022571148184, "grad_norm": 125.42492200760786, "learning_rate": 4.932465787459808e-07, "logits/chosen": -2.721862554550171, "logits/rejected": -2.5102813243865967, "logps/chosen": -277.822265625, "logps/rejected": -292.1211853027344, "loss": 0.8472, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -11.745685577392578, "rewards/margins": 27.213459014892578, "rewards/rejected": -38.959144592285156, "step": 850 }, { "epoch": 0.16879293424926398, "grad_norm": 102.193525655049, "learning_rate": 4.92845410081439e-07, "logits/chosen": -2.482775926589966, "logits/rejected": -2.3314762115478516, "logps/chosen": -261.40966796875, "logps/rejected": -338.08807373046875, "loss": 0.7835, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -2.526902914047241, "rewards/margins": 51.9024543762207, "rewards/rejected": -54.429351806640625, "step": 860 }, { "epoch": 0.17075564278704614, "grad_norm": 115.87935210871359, "learning_rate": 4.924328402509833e-07, "logits/chosen": -2.372128486633301, "logits/rejected": -2.2885146141052246, "logps/chosen": -290.76171875, "logps/rejected": -279.03717041015625, "loss": 0.8609, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -16.973407745361328, "rewards/margins": 24.9534912109375, "rewards/rejected": -41.926902770996094, "step": 870 }, { "epoch": 0.17271835132482827, "grad_norm": 91.9583983291799, "learning_rate": 4.920088886240434e-07, "logits/chosen": -2.22208833694458, "logits/rejected": -1.8878209590911865, "logps/chosen": -283.6766052246094, "logps/rejected": -322.5338134765625, "loss": 0.7097, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 0.8781765103340149, "rewards/margins": 63.73139190673828, "rewards/rejected": -62.85321044921875, "step": 880 }, { "epoch": 0.1746810598626104, "grad_norm": 109.17651874471736, "learning_rate": 4.915735751044045e-07, "logits/chosen": -2.5790162086486816, "logits/rejected": -2.3382654190063477, "logps/chosen": -292.802978515625, "logps/rejected": -306.8439636230469, "loss": 0.8199, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.430455207824707, "rewards/margins": 55.915443420410156, "rewards/rejected": -60.34589385986328, "step": 890 }, { "epoch": 0.17664376840039253, "grad_norm": 96.61364295159595, "learning_rate": 4.911269201292724e-07, "logits/chosen": -2.6688895225524902, "logits/rejected": -2.5164670944213867, "logps/chosen": -303.59808349609375, "logps/rejected": -293.88421630859375, "loss": 1.1261, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -9.23335075378418, "rewards/margins": 26.477336883544922, "rewards/rejected": -35.71068572998047, "step": 900 }, { "epoch": 0.1786064769381747, "grad_norm": 101.65905314805946, "learning_rate": 4.906689446683146e-07, "logits/chosen": -2.7267227172851562, "logits/rejected": -2.765730142593384, "logps/chosen": -239.03140258789062, "logps/rejected": -349.16693115234375, "loss": 0.8145, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -1.8973948955535889, "rewards/margins": 30.037006378173828, "rewards/rejected": -31.934402465820312, "step": 910 }, { "epoch": 0.18056918547595682, "grad_norm": 71.30262459226851, "learning_rate": 4.901996702226755e-07, "logits/chosen": -2.5833592414855957, "logits/rejected": -2.509138345718384, "logps/chosen": -310.91644287109375, "logps/rejected": -388.25543212890625, "loss": 0.7378, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -10.291021347045898, "rewards/margins": 40.18842315673828, "rewards/rejected": -50.47944259643555, "step": 920 }, { "epoch": 0.18253189401373895, "grad_norm": 96.31359613253149, "learning_rate": 4.897191188239667e-07, "logits/chosen": -2.779433250427246, "logits/rejected": -2.302976608276367, "logps/chosen": -324.71844482421875, "logps/rejected": -252.04452514648438, "loss": 0.7902, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -0.6819040179252625, "rewards/margins": 40.18635559082031, "rewards/rejected": -40.868263244628906, "step": 930 }, { "epoch": 0.1844946025515211, "grad_norm": 140.24838024568245, "learning_rate": 4.892273130332334e-07, "logits/chosen": -2.7375874519348145, "logits/rejected": -2.577573776245117, "logps/chosen": -327.95574951171875, "logps/rejected": -418.0184631347656, "loss": 0.8359, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.07719822227954865, "rewards/margins": 58.53068923950195, "rewards/rejected": -58.4535026550293, "step": 940 }, { "epoch": 0.18645731108930325, "grad_norm": 76.66142422220355, "learning_rate": 4.887242759398945e-07, "logits/chosen": -2.5748062133789062, "logits/rejected": -2.199622392654419, "logps/chosen": -203.68019104003906, "logps/rejected": -259.32379150390625, "loss": 0.724, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.095276832580566, "rewards/margins": 59.2440071105957, "rewards/rejected": -65.33927917480469, "step": 950 }, { "epoch": 0.18842001962708538, "grad_norm": 85.30326894447451, "learning_rate": 4.88210031160659e-07, "logits/chosen": -2.5354979038238525, "logits/rejected": -2.4008145332336426, "logps/chosen": -262.669921875, "logps/rejected": -293.5189514160156, "loss": 0.8034, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 4.096038818359375, "rewards/margins": 44.92534637451172, "rewards/rejected": -40.82931137084961, "step": 960 }, { "epoch": 0.1903827281648675, "grad_norm": 102.27266956632148, "learning_rate": 4.876846028384169e-07, "logits/chosen": -2.6679558753967285, "logits/rejected": -2.516143321990967, "logps/chosen": -211.1438446044922, "logps/rejected": -282.89447021484375, "loss": 0.8526, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.259778022766113, "rewards/margins": 40.05314636230469, "rewards/rejected": -47.31291961669922, "step": 970 }, { "epoch": 0.19234543670264967, "grad_norm": 79.54364020865353, "learning_rate": 4.87148015641106e-07, "logits/chosen": -2.8125267028808594, "logits/rejected": -2.8086764812469482, "logps/chosen": -254.4888916015625, "logps/rejected": -300.8182067871094, "loss": 0.7906, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.437692642211914, "rewards/margins": 22.498292922973633, "rewards/rejected": -25.935983657836914, "step": 980 }, { "epoch": 0.1943081452404318, "grad_norm": 49.982055994957676, "learning_rate": 4.866002947605539e-07, "logits/chosen": -2.738621473312378, "logits/rejected": -2.5494697093963623, "logps/chosen": -233.21725463867188, "logps/rejected": -274.02520751953125, "loss": 0.8497, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 3.6750988960266113, "rewards/margins": 35.068016052246094, "rewards/rejected": -31.39291763305664, "step": 990 }, { "epoch": 0.19627085377821393, "grad_norm": 79.17130890734822, "learning_rate": 4.860414659112948e-07, "logits/chosen": -2.6800758838653564, "logits/rejected": -2.2188613414764404, "logps/chosen": -242.1259765625, "logps/rejected": -245.241943359375, "loss": 0.8125, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -4.476802825927734, "rewards/margins": 48.006919860839844, "rewards/rejected": -52.483726501464844, "step": 1000 }, { "epoch": 0.19823356231599606, "grad_norm": 57.62240982578842, "learning_rate": 4.854715553293627e-07, "logits/chosen": -2.825094699859619, "logits/rejected": -2.3988935947418213, "logps/chosen": -303.86993408203125, "logps/rejected": -250.88583374023438, "loss": 0.7769, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 10.331136703491211, "rewards/margins": 58.67095184326172, "rewards/rejected": -48.339820861816406, "step": 1010 }, { "epoch": 0.20019627085377822, "grad_norm": 113.2137205006527, "learning_rate": 4.848905897710595e-07, "logits/chosen": -2.265653133392334, "logits/rejected": -2.0467350482940674, "logps/chosen": -338.335205078125, "logps/rejected": -269.71435546875, "loss": 0.8587, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -27.2716064453125, "rewards/margins": 25.582565307617188, "rewards/rejected": -52.85417556762695, "step": 1020 }, { "epoch": 0.20215897939156036, "grad_norm": 76.89183096947703, "learning_rate": 4.842985965116987e-07, "logits/chosen": -2.554537057876587, "logits/rejected": -2.2399966716766357, "logps/chosen": -360.2703857421875, "logps/rejected": -343.6308898925781, "loss": 0.8788, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -19.235864639282227, "rewards/margins": 42.635780334472656, "rewards/rejected": -61.87163543701172, "step": 1030 }, { "epoch": 0.2041216879293425, "grad_norm": 8404.650043143132, "learning_rate": 4.836956033443253e-07, "logits/chosen": -2.6381564140319824, "logits/rejected": -2.3341073989868164, "logps/chosen": -358.1074523925781, "logps/rejected": -400.53143310546875, "loss": 1.3652, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 9.0745210647583, "rewards/margins": 28.4394588470459, "rewards/rejected": -19.364938735961914, "step": 1040 }, { "epoch": 0.20608439646712462, "grad_norm": 80.66330563886407, "learning_rate": 4.830816385784104e-07, "logits/chosen": -2.75312876701355, "logits/rejected": -2.495368480682373, "logps/chosen": -306.1630554199219, "logps/rejected": -303.66131591796875, "loss": 0.7965, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 1.665053367614746, "rewards/margins": 26.665075302124023, "rewards/rejected": -25.000019073486328, "step": 1050 }, { "epoch": 0.20804710500490678, "grad_norm": 65.0991396267574, "learning_rate": 4.824567310385226e-07, "logits/chosen": -2.6330349445343018, "logits/rejected": -2.3273844718933105, "logps/chosen": -310.6580505371094, "logps/rejected": -303.3229064941406, "loss": 0.7311, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 6.711798191070557, "rewards/margins": 47.98042297363281, "rewards/rejected": -41.26862335205078, "step": 1060 }, { "epoch": 0.2100098135426889, "grad_norm": 104.95807619792653, "learning_rate": 4.818209100629744e-07, "logits/chosen": -2.545565366744995, "logits/rejected": -2.3559556007385254, "logps/chosen": -240.33895874023438, "logps/rejected": -297.4998779296875, "loss": 0.7708, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 8.331080436706543, "rewards/margins": 34.901241302490234, "rewards/rejected": -26.57016372680664, "step": 1070 }, { "epoch": 0.21197252208047104, "grad_norm": 136.532291563123, "learning_rate": 4.81174205502445e-07, "logits/chosen": -2.0786688327789307, "logits/rejected": -1.9513747692108154, "logps/chosen": -232.86410522460938, "logps/rejected": -262.2410888671875, "loss": 0.7981, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -11.664804458618164, "rewards/margins": 39.04685592651367, "rewards/rejected": -50.7116584777832, "step": 1080 }, { "epoch": 0.2139352306182532, "grad_norm": 184.7688152609819, "learning_rate": 4.80516647718579e-07, "logits/chosen": -2.007607936859131, "logits/rejected": -1.554937481880188, "logps/chosen": -254.9839324951172, "logps/rejected": -346.6573181152344, "loss": 0.7641, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -15.211018562316895, "rewards/margins": 77.2526626586914, "rewards/rejected": -92.46367645263672, "step": 1090 }, { "epoch": 0.21589793915603533, "grad_norm": 69.66871328203887, "learning_rate": 4.798482675825602e-07, "logits/chosen": -2.2417311668395996, "logits/rejected": -2.0823099613189697, "logps/chosen": -204.16978454589844, "logps/rejected": -293.24285888671875, "loss": 0.8532, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.027112007141113, "rewards/margins": 49.02958679199219, "rewards/rejected": -45.002471923828125, "step": 1100 }, { "epoch": 0.21786064769381747, "grad_norm": 95.00486251214005, "learning_rate": 4.791690964736636e-07, "logits/chosen": -2.4260010719299316, "logits/rejected": -2.0604798793792725, "logps/chosen": -283.0420837402344, "logps/rejected": -258.2274475097656, "loss": 0.7877, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 28.72829246520996, "rewards/margins": 71.94486999511719, "rewards/rejected": -43.21657943725586, "step": 1110 }, { "epoch": 0.2198233562315996, "grad_norm": 79.67497321184206, "learning_rate": 4.78479166277781e-07, "logits/chosen": -2.568878412246704, "logits/rejected": -2.088095188140869, "logps/chosen": -333.7295837402344, "logps/rejected": -329.0635681152344, "loss": 0.7276, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.5225930213928223, "rewards/margins": 41.53412628173828, "rewards/rejected": -44.05671310424805, "step": 1120 }, { "epoch": 0.22178606476938176, "grad_norm": 122.16764617937987, "learning_rate": 4.777785093859247e-07, "logits/chosen": -2.6671195030212402, "logits/rejected": -1.9225339889526367, "logps/chosen": -293.151611328125, "logps/rejected": -377.86529541015625, "loss": 1.2967, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -8.485434532165527, "rewards/margins": 77.22737121582031, "rewards/rejected": -85.71280670166016, "step": 1130 }, { "epoch": 0.2237487733071639, "grad_norm": 117.82973467011566, "learning_rate": 4.770671586927063e-07, "logits/chosen": -2.7846384048461914, "logits/rejected": -2.538395404815674, "logps/chosen": -353.89129638671875, "logps/rejected": -358.82928466796875, "loss": 0.742, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 2.9781851768493652, "rewards/margins": 33.23398971557617, "rewards/rejected": -30.255807876586914, "step": 1140 }, { "epoch": 0.22571148184494602, "grad_norm": 101.61056689581227, "learning_rate": 4.7634514759479275e-07, "logits/chosen": -2.793891668319702, "logits/rejected": -2.3743512630462646, "logps/chosen": -276.8369445800781, "logps/rejected": -291.7186279296875, "loss": 0.7049, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 8.823906898498535, "rewards/margins": 79.89847564697266, "rewards/rejected": -71.07456970214844, "step": 1150 }, { "epoch": 0.22767419038272815, "grad_norm": 72.67792938735357, "learning_rate": 4.7561250998933835e-07, "logits/chosen": -2.3937525749206543, "logits/rejected": -1.9683380126953125, "logps/chosen": -347.2264709472656, "logps/rejected": -277.95770263671875, "loss": 0.7523, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -2.9054484367370605, "rewards/margins": 60.65302658081055, "rewards/rejected": -63.5584716796875, "step": 1160 }, { "epoch": 0.2296368989205103, "grad_norm": 65.07969809044923, "learning_rate": 4.7486928027239304e-07, "logits/chosen": -2.2467610836029053, "logits/rejected": -1.9916776418685913, "logps/chosen": -204.47714233398438, "logps/rejected": -266.11297607421875, "loss": 0.8108, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": 1.171684980392456, "rewards/margins": 55.38752365112305, "rewards/rejected": -54.215843200683594, "step": 1170 }, { "epoch": 0.23159960745829244, "grad_norm": 79.79279922361413, "learning_rate": 4.7411549333728807e-07, "logits/chosen": -2.3492677211761475, "logits/rejected": -2.328451633453369, "logps/chosen": -308.316650390625, "logps/rejected": -328.06988525390625, "loss": 0.8285, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -29.842716217041016, "rewards/margins": 13.971054077148438, "rewards/rejected": -43.81376647949219, "step": 1180 }, { "epoch": 0.23356231599607458, "grad_norm": 59.37661931766882, "learning_rate": 4.7335118457299756e-07, "logits/chosen": -2.700810432434082, "logits/rejected": -2.4827983379364014, "logps/chosen": -277.98974609375, "logps/rejected": -297.0730895996094, "loss": 0.8001, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 9.98155689239502, "rewards/margins": 45.6263427734375, "rewards/rejected": -35.6447868347168, "step": 1190 }, { "epoch": 0.23552502453385674, "grad_norm": 66.06096356303384, "learning_rate": 4.7257638986247684e-07, "logits/chosen": -2.6501240730285645, "logits/rejected": -2.4950852394104004, "logps/chosen": -293.5074462890625, "logps/rejected": -389.7607421875, "loss": 0.8337, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 3.6907737255096436, "rewards/margins": 48.39918899536133, "rewards/rejected": -44.70841598510742, "step": 1200 }, { "epoch": 0.23748773307163887, "grad_norm": 79.05782848580151, "learning_rate": 4.7179114558097814e-07, "logits/chosen": -2.467741012573242, "logits/rejected": -2.1841864585876465, "logps/chosen": -251.1241455078125, "logps/rejected": -233.6212158203125, "loss": 0.767, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 8.178865432739258, "rewards/margins": 36.897918701171875, "rewards/rejected": -28.71905517578125, "step": 1210 }, { "epoch": 0.239450441609421, "grad_norm": 69.82498939026273, "learning_rate": 4.709954885943428e-07, "logits/chosen": -2.4487500190734863, "logits/rejected": -2.2399919033050537, "logps/chosen": -275.9875793457031, "logps/rejected": -252.0728759765625, "loss": 0.8944, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -3.241147994995117, "rewards/margins": 31.412546157836914, "rewards/rejected": -34.65369415283203, "step": 1220 }, { "epoch": 0.24141315014720313, "grad_norm": 69.09830209737727, "learning_rate": 4.7018945625727026e-07, "logits/chosen": -2.7463760375976562, "logits/rejected": -2.5719149112701416, "logps/chosen": -289.55145263671875, "logps/rejected": -291.48736572265625, "loss": 0.8644, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -11.859620094299316, "rewards/margins": 10.369027137756348, "rewards/rejected": -22.228647232055664, "step": 1230 }, { "epoch": 0.2433758586849853, "grad_norm": 63.69392368063129, "learning_rate": 4.6937308641156447e-07, "logits/chosen": -2.6999282836914062, "logits/rejected": -2.6278529167175293, "logps/chosen": -204.9237060546875, "logps/rejected": -232.798828125, "loss": 0.7994, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 5.174790382385254, "rewards/margins": 25.665557861328125, "rewards/rejected": -20.490766525268555, "step": 1240 }, { "epoch": 0.24533856722276742, "grad_norm": 90.46664706018922, "learning_rate": 4.685464173843574e-07, "logits/chosen": -2.6145846843719482, "logits/rejected": -2.6193227767944336, "logps/chosen": -230.6596221923828, "logps/rejected": -283.3306579589844, "loss": 0.8066, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 3.331930160522461, "rewards/margins": 31.35042381286621, "rewards/rejected": -28.018497467041016, "step": 1250 }, { "epoch": 0.24730127576054955, "grad_norm": 114.85503360960028, "learning_rate": 4.677094879863093e-07, "logits/chosen": -2.633307456970215, "logits/rejected": -2.2867069244384766, "logps/chosen": -250.37411499023438, "logps/rejected": -268.84796142578125, "loss": 0.7707, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.964310884475708, "rewards/margins": 50.154048919677734, "rewards/rejected": -52.11835861206055, "step": 1260 }, { "epoch": 0.2492639842983317, "grad_norm": 69.86561903798685, "learning_rate": 4.66862337509787e-07, "logits/chosen": -2.7894675731658936, "logits/rejected": -2.508972644805908, "logps/chosen": -310.96435546875, "logps/rejected": -264.95904541015625, "loss": 0.8725, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 9.859870910644531, "rewards/margins": 54.62556076049805, "rewards/rejected": -44.765689849853516, "step": 1270 }, { "epoch": 0.2512266928361138, "grad_norm": 100.49406990713037, "learning_rate": 4.660050057270191e-07, "logits/chosen": -2.6781601905822754, "logits/rejected": -2.0707125663757324, "logps/chosen": -247.5864715576172, "logps/rejected": -270.4193420410156, "loss": 0.7601, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 8.501138687133789, "rewards/margins": 64.88340759277344, "rewards/rejected": -56.38227462768555, "step": 1280 }, { "epoch": 0.25318940137389595, "grad_norm": 100.57375598829702, "learning_rate": 4.6513753288822833e-07, "logits/chosen": -2.654413938522339, "logits/rejected": -2.2316246032714844, "logps/chosen": -156.7413330078125, "logps/rejected": -222.46182250976562, "loss": 0.8087, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.560234308242798, "rewards/margins": 52.28660202026367, "rewards/rejected": -49.72637176513672, "step": 1290 }, { "epoch": 0.25515210991167814, "grad_norm": 103.96489879956751, "learning_rate": 4.6425995971974265e-07, "logits/chosen": -2.5510106086730957, "logits/rejected": -2.102264404296875, "logps/chosen": -298.84246826171875, "logps/rejected": -271.9053955078125, "loss": 0.8927, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.558160781860352, "rewards/margins": 39.32708740234375, "rewards/rejected": -54.88524627685547, "step": 1300 }, { "epoch": 0.25711481844946027, "grad_norm": 73.0530969272212, "learning_rate": 4.633723274220824e-07, "logits/chosen": -2.760584592819214, "logits/rejected": -2.485985279083252, "logps/chosen": -305.2301940917969, "logps/rejected": -376.43707275390625, "loss": 0.8051, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -8.29564094543457, "rewards/margins": 44.65286636352539, "rewards/rejected": -52.94850540161133, "step": 1310 }, { "epoch": 0.2590775269872424, "grad_norm": 82.97504691585438, "learning_rate": 4.624746776680267e-07, "logits/chosen": -2.41534161567688, "logits/rejected": -2.352321147918701, "logps/chosen": -260.76361083984375, "logps/rejected": -288.9218444824219, "loss": 0.7943, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -12.183237075805664, "rewards/margins": 30.011730194091797, "rewards/rejected": -42.19496536254883, "step": 1320 }, { "epoch": 0.26104023552502453, "grad_norm": 62.79578893027463, "learning_rate": 4.6156705260065634e-07, "logits/chosen": -2.4285244941711426, "logits/rejected": -2.3893349170684814, "logps/chosen": -206.4307098388672, "logps/rejected": -264.3067321777344, "loss": 0.8304, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": 2.7431342601776123, "rewards/margins": 41.91610336303711, "rewards/rejected": -39.172969818115234, "step": 1330 }, { "epoch": 0.26300294406280667, "grad_norm": 77.60045235515152, "learning_rate": 4.606494948313758e-07, "logits/chosen": -2.6037545204162598, "logits/rejected": -2.285953998565674, "logps/chosen": -245.8831787109375, "logps/rejected": -284.19744873046875, "loss": 0.778, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -3.7715911865234375, "rewards/margins": 24.767900466918945, "rewards/rejected": -28.53949546813965, "step": 1340 }, { "epoch": 0.2649656526005888, "grad_norm": 68.96144509934302, "learning_rate": 4.597220474379125e-07, "logits/chosen": -2.5462422370910645, "logits/rejected": -2.635493755340576, "logps/chosen": -338.59820556640625, "logps/rejected": -362.95416259765625, "loss": 0.903, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -11.323179244995117, "rewards/margins": 28.475255966186523, "rewards/rejected": -39.79842758178711, "step": 1350 }, { "epoch": 0.26692836113837093, "grad_norm": 94.07556196834392, "learning_rate": 4.587847539622942e-07, "logits/chosen": -2.438211441040039, "logits/rejected": -2.4068198204040527, "logps/chosen": -380.3414611816406, "logps/rejected": -356.20404052734375, "loss": 0.6732, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -2.1833744049072266, "rewards/margins": 30.749645233154297, "rewards/rejected": -32.933021545410156, "step": 1360 }, { "epoch": 0.2688910696761531, "grad_norm": 130.05058432574347, "learning_rate": 4.5783765840880505e-07, "logits/chosen": -2.4395289421081543, "logits/rejected": -2.206676483154297, "logps/chosen": -327.61309814453125, "logps/rejected": -387.9151306152344, "loss": 0.8196, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 0.41328972578048706, "rewards/margins": 81.57083129882812, "rewards/rejected": -81.15753173828125, "step": 1370 }, { "epoch": 0.27085377821393525, "grad_norm": 137.38427651474393, "learning_rate": 4.568808052419196e-07, "logits/chosen": -2.3273987770080566, "logits/rejected": -2.136598587036133, "logps/chosen": -226.7196502685547, "logps/rejected": -289.2423400878906, "loss": 0.7434, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -2.1149885654449463, "rewards/margins": 69.53654479980469, "rewards/rejected": -71.65152740478516, "step": 1380 }, { "epoch": 0.2728164867517174, "grad_norm": 64.35613492436532, "learning_rate": 4.5591423938421513e-07, "logits/chosen": -2.4769833087921143, "logits/rejected": -2.290013551712036, "logps/chosen": -293.5498046875, "logps/rejected": -303.51495361328125, "loss": 0.815, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -9.683921813964844, "rewards/margins": 35.344207763671875, "rewards/rejected": -45.028133392333984, "step": 1390 }, { "epoch": 0.2747791952894995, "grad_norm": 63.703854209480724, "learning_rate": 4.549380062142627e-07, "logits/chosen": -2.5791192054748535, "logits/rejected": -2.526895523071289, "logps/chosen": -269.97515869140625, "logps/rejected": -320.694580078125, "loss": 0.8424, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -9.466176986694336, "rewards/margins": 17.470767974853516, "rewards/rejected": -26.936946868896484, "step": 1400 }, { "epoch": 0.27674190382728164, "grad_norm": 124.6235038207521, "learning_rate": 4.5395215156449683e-07, "logits/chosen": -2.523416757583618, "logits/rejected": -2.53853440284729, "logps/chosen": -302.0522766113281, "logps/rejected": -389.78546142578125, "loss": 0.7314, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8244163393974304, "rewards/margins": 41.51442337036133, "rewards/rejected": -40.69001007080078, "step": 1410 }, { "epoch": 0.2787046123650638, "grad_norm": 101.615348155234, "learning_rate": 4.5295672171906365e-07, "logits/chosen": -2.5056309700012207, "logits/rejected": -2.2287116050720215, "logps/chosen": -268.10992431640625, "logps/rejected": -270.0954895019531, "loss": 0.7739, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -2.6450417041778564, "rewards/margins": 39.11285400390625, "rewards/rejected": -41.757896423339844, "step": 1420 }, { "epoch": 0.2806673209028459, "grad_norm": 93.45651681650695, "learning_rate": 4.5195176341164765e-07, "logits/chosen": -2.49881649017334, "logits/rejected": -2.2419865131378174, "logps/chosen": -277.0741271972656, "logps/rejected": -355.82232666015625, "loss": 0.8458, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.585633277893066, "rewards/margins": 52.39763641357422, "rewards/rejected": -60.98326873779297, "step": 1430 }, { "epoch": 0.2826300294406281, "grad_norm": 80.74339965228211, "learning_rate": 4.509373238232782e-07, "logits/chosen": -2.240375518798828, "logits/rejected": -2.1151089668273926, "logps/chosen": -305.4589538574219, "logps/rejected": -272.02667236328125, "loss": 0.765, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -6.646783351898193, "rewards/margins": 31.581079483032227, "rewards/rejected": -38.22786331176758, "step": 1440 }, { "epoch": 0.2845927379784102, "grad_norm": 95.65982027392724, "learning_rate": 4.499134505801141e-07, "logits/chosen": -2.345745801925659, "logits/rejected": -2.111720323562622, "logps/chosen": -227.3837127685547, "logps/rejected": -314.46832275390625, "loss": 0.7606, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6170268058776855, "rewards/margins": 65.8896484375, "rewards/rejected": -62.272621154785156, "step": 1450 }, { "epoch": 0.28655544651619236, "grad_norm": 74.11839149728634, "learning_rate": 4.488801917512076e-07, "logits/chosen": -2.352388381958008, "logits/rejected": -2.520425319671631, "logps/chosen": -315.57464599609375, "logps/rejected": -375.50128173828125, "loss": 0.9394, "rewards/accuracies": 0.5, "rewards/chosen": -26.152206420898438, "rewards/margins": 12.819509506225586, "rewards/rejected": -38.971717834472656, "step": 1460 }, { "epoch": 0.2885181550539745, "grad_norm": 75.88122332266589, "learning_rate": 4.478375958462479e-07, "logits/chosen": -2.625180721282959, "logits/rejected": -2.2984683513641357, "logps/chosen": -309.5433349609375, "logps/rejected": -284.91552734375, "loss": 0.8309, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -10.632726669311523, "rewards/margins": 48.80922317504883, "rewards/rejected": -59.44194412231445, "step": 1470 }, { "epoch": 0.2904808635917566, "grad_norm": 77.60876590830262, "learning_rate": 4.467857118132833e-07, "logits/chosen": -2.642885684967041, "logits/rejected": -2.6552321910858154, "logps/chosen": -250.0420684814453, "logps/rejected": -267.0228271484375, "loss": 0.8327, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -3.1296164989471436, "rewards/margins": 19.802265167236328, "rewards/rejected": -22.931880950927734, "step": 1480 }, { "epoch": 0.29244357212953875, "grad_norm": 78.54925961147342, "learning_rate": 4.457245890364235e-07, "logits/chosen": -2.804276466369629, "logits/rejected": -2.5029938220977783, "logps/chosen": -338.68621826171875, "logps/rejected": -293.933837890625, "loss": 0.866, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": 10.880395889282227, "rewards/margins": 40.769683837890625, "rewards/rejected": -29.8892879486084, "step": 1490 }, { "epoch": 0.2944062806673209, "grad_norm": 78.87610771063265, "learning_rate": 4.4465427733352124e-07, "logits/chosen": -2.519253730773926, "logits/rejected": -2.283416509628296, "logps/chosen": -279.54559326171875, "logps/rejected": -321.074462890625, "loss": 0.7123, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 3.099043607711792, "rewards/margins": 51.63875198364258, "rewards/rejected": -48.53971481323242, "step": 1500 }, { "epoch": 0.296368989205103, "grad_norm": 315.8333072505281, "learning_rate": 4.43574826953833e-07, "logits/chosen": -2.3678691387176514, "logits/rejected": -2.0554909706115723, "logps/chosen": -308.0277404785156, "logps/rejected": -393.2145080566406, "loss": 0.7979, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.312995433807373, "rewards/margins": 70.61099243164062, "rewards/rejected": -67.2979965209961, "step": 1510 }, { "epoch": 0.2983316977428852, "grad_norm": 72.07193809253059, "learning_rate": 4.4248628857565997e-07, "logits/chosen": -2.221647262573242, "logits/rejected": -1.8691152334213257, "logps/chosen": -335.20184326171875, "logps/rejected": -267.240966796875, "loss": 0.7648, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1237494945526123, "rewards/margins": 55.95257568359375, "rewards/rejected": -54.828834533691406, "step": 1520 }, { "epoch": 0.30029440628066734, "grad_norm": 88.46091873446755, "learning_rate": 4.413887133039692e-07, "logits/chosen": -2.6441397666931152, "logits/rejected": -1.8982112407684326, "logps/chosen": -409.7470703125, "logps/rejected": -336.4615478515625, "loss": 0.7529, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -0.1140478104352951, "rewards/margins": 61.70000457763672, "rewards/rejected": -61.814056396484375, "step": 1530 }, { "epoch": 0.30225711481844947, "grad_norm": 76.34058048847936, "learning_rate": 4.4028215266799395e-07, "logits/chosen": -2.56492280960083, "logits/rejected": -2.0929627418518066, "logps/chosen": -266.294921875, "logps/rejected": -282.0745544433594, "loss": 0.729, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 6.829888820648193, "rewards/margins": 67.3569107055664, "rewards/rejected": -60.527015686035156, "step": 1540 }, { "epoch": 0.3042198233562316, "grad_norm": 122.77230031865297, "learning_rate": 4.391666586188145e-07, "logits/chosen": -2.0806310176849365, "logits/rejected": -1.7542095184326172, "logps/chosen": -210.2320098876953, "logps/rejected": -272.1443176269531, "loss": 0.7451, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 2.466217041015625, "rewards/margins": 48.363460540771484, "rewards/rejected": -45.89724349975586, "step": 1550 }, { "epoch": 0.30618253189401373, "grad_norm": 88.70763520673025, "learning_rate": 4.380422835269193e-07, "logits/chosen": -2.098982572555542, "logits/rejected": -1.8768548965454102, "logps/chosen": -265.6413269042969, "logps/rejected": -319.20672607421875, "loss": 0.8241, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -11.66611099243164, "rewards/margins": 41.30660629272461, "rewards/rejected": -52.97271728515625, "step": 1560 }, { "epoch": 0.30814524043179586, "grad_norm": 99.34980534985873, "learning_rate": 4.3690908017974596e-07, "logits/chosen": -2.027304172515869, "logits/rejected": -1.8482277393341064, "logps/chosen": -233.01089477539062, "logps/rejected": -332.70245361328125, "loss": 0.8457, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -7.273612976074219, "rewards/margins": 52.13492965698242, "rewards/rejected": -59.408546447753906, "step": 1570 }, { "epoch": 0.310107948969578, "grad_norm": 75.64619313952885, "learning_rate": 4.3576710177920356e-07, "logits/chosen": -2.0713438987731934, "logits/rejected": -1.7619683742523193, "logps/chosen": -260.17523193359375, "logps/rejected": -299.4717712402344, "loss": 0.8172, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -23.76706886291504, "rewards/margins": 45.05198669433594, "rewards/rejected": -68.81905364990234, "step": 1580 }, { "epoch": 0.3120706575073602, "grad_norm": 128.82206162085666, "learning_rate": 4.346164019391742e-07, "logits/chosen": -2.363157272338867, "logits/rejected": -1.7725169658660889, "logps/chosen": -360.8402099609375, "logps/rejected": -399.99041748046875, "loss": 0.7845, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.4936651289463043, "rewards/margins": 67.40812683105469, "rewards/rejected": -66.91445922851562, "step": 1590 }, { "epoch": 0.3140333660451423, "grad_norm": 107.19900181246759, "learning_rate": 4.3345703468299634e-07, "logits/chosen": -2.287259101867676, "logits/rejected": -2.0964293479919434, "logps/chosen": -294.94305419921875, "logps/rejected": -317.75567626953125, "loss": 0.8989, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -17.214643478393555, "rewards/margins": 29.66391372680664, "rewards/rejected": -46.87855529785156, "step": 1600 }, { "epoch": 0.31599607458292445, "grad_norm": 100.04289419483206, "learning_rate": 4.322890544409286e-07, "logits/chosen": -2.672071933746338, "logits/rejected": -1.937684416770935, "logps/chosen": -303.5447692871094, "logps/rejected": -324.1817321777344, "loss": 0.6239, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 10.904064178466797, "rewards/margins": 76.38121032714844, "rewards/rejected": -65.47714233398438, "step": 1610 }, { "epoch": 0.3179587831207066, "grad_norm": 87.43845967943851, "learning_rate": 4.311125160475938e-07, "logits/chosen": -2.3708949089050293, "logits/rejected": -2.3977978229522705, "logps/chosen": -275.3824462890625, "logps/rejected": -394.8155822753906, "loss": 0.7203, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -3.281172275543213, "rewards/margins": 39.0241813659668, "rewards/rejected": -42.30535125732422, "step": 1620 }, { "epoch": 0.3199214916584887, "grad_norm": 139.7124642106334, "learning_rate": 4.299274747394055e-07, "logits/chosen": -2.1663546562194824, "logits/rejected": -1.8903602361679077, "logps/chosen": -264.494873046875, "logps/rejected": -319.666259765625, "loss": 0.6745, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 1.870364785194397, "rewards/margins": 79.11808776855469, "rewards/rejected": -77.24771881103516, "step": 1630 }, { "epoch": 0.32188420019627084, "grad_norm": 104.50787428493565, "learning_rate": 4.287339861519737e-07, "logits/chosen": -2.360588788986206, "logits/rejected": -2.1207916736602783, "logps/chosen": -303.96710205078125, "logps/rejected": -344.24920654296875, "loss": 0.7933, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -4.274575233459473, "rewards/margins": 57.10742950439453, "rewards/rejected": -61.38199996948242, "step": 1640 }, { "epoch": 0.323846908734053, "grad_norm": 109.43626421863449, "learning_rate": 4.275321063174936e-07, "logits/chosen": -2.5362002849578857, "logits/rejected": -2.434088945388794, "logps/chosen": -340.662353515625, "logps/rejected": -296.8583679199219, "loss": 0.7737, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -4.014067649841309, "rewards/margins": 37.383609771728516, "rewards/rejected": -41.397674560546875, "step": 1650 }, { "epoch": 0.3258096172718351, "grad_norm": 63.727579386159455, "learning_rate": 4.2632189166211454e-07, "logits/chosen": -2.3239457607269287, "logits/rejected": -2.4812355041503906, "logps/chosen": -238.8297576904297, "logps/rejected": -299.589599609375, "loss": 0.7659, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -1.3575565814971924, "rewards/margins": 34.25685119628906, "rewards/rejected": -35.614410400390625, "step": 1660 }, { "epoch": 0.3277723258096173, "grad_norm": 726.9430681291926, "learning_rate": 4.251033990032912e-07, "logits/chosen": -2.3445041179656982, "logits/rejected": -2.0732221603393555, "logps/chosen": -303.9436340332031, "logps/rejected": -392.46575927734375, "loss": 0.9339, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": -6.846814155578613, "rewards/margins": 81.260986328125, "rewards/rejected": -88.10780334472656, "step": 1670 }, { "epoch": 0.3297350343473994, "grad_norm": 73.85181646748991, "learning_rate": 4.238766855471161e-07, "logits/chosen": -2.3831021785736084, "logits/rejected": -2.153560161590576, "logps/chosen": -351.2648010253906, "logps/rejected": -295.6867980957031, "loss": 0.7584, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -4.382540702819824, "rewards/margins": 56.24854278564453, "rewards/rejected": -60.631080627441406, "step": 1680 }, { "epoch": 0.33169774288518156, "grad_norm": 78.59735486387085, "learning_rate": 4.226418088856335e-07, "logits/chosen": -2.4451980590820312, "logits/rejected": -2.2495720386505127, "logps/chosen": -259.4072265625, "logps/rejected": -396.7475891113281, "loss": 0.7717, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.996706247329712, "rewards/margins": 66.77244567871094, "rewards/rejected": -64.77574157714844, "step": 1690 }, { "epoch": 0.3336604514229637, "grad_norm": 142.8024728633196, "learning_rate": 4.2139882699413613e-07, "logits/chosen": -2.649484157562256, "logits/rejected": -2.0010745525360107, "logps/chosen": -282.3056945800781, "logps/rejected": -252.3628387451172, "loss": 0.6975, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -10.299084663391113, "rewards/margins": 51.43207931518555, "rewards/rejected": -61.731163024902344, "step": 1700 }, { "epoch": 0.3356231599607458, "grad_norm": 96.69157474694374, "learning_rate": 4.2014779822844274e-07, "logits/chosen": -2.443678140640259, "logits/rejected": -2.1429636478424072, "logps/chosen": -249.7233123779297, "logps/rejected": -327.57891845703125, "loss": 0.7922, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.686555862426758, "rewards/margins": 51.78627395629883, "rewards/rejected": -71.47282409667969, "step": 1710 }, { "epoch": 0.33758586849852795, "grad_norm": 140.59164330489955, "learning_rate": 4.18888781322159e-07, "logits/chosen": -2.5568032264709473, "logits/rejected": -1.9281814098358154, "logps/chosen": -251.53561401367188, "logps/rejected": -339.05340576171875, "loss": 0.645, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -4.498878479003906, "rewards/margins": 77.38370513916016, "rewards/rejected": -81.88258361816406, "step": 1720 }, { "epoch": 0.3395485770363101, "grad_norm": 105.02484733671626, "learning_rate": 4.176218353839195e-07, "logits/chosen": -2.4342472553253174, "logits/rejected": -2.203434705734253, "logps/chosen": -273.43048095703125, "logps/rejected": -278.9997863769531, "loss": 0.8072, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -11.107572555541992, "rewards/margins": 57.830665588378906, "rewards/rejected": -68.93824768066406, "step": 1730 }, { "epoch": 0.34151128557409227, "grad_norm": 95.64380107039757, "learning_rate": 4.1634701989461325e-07, "logits/chosen": -2.2818996906280518, "logits/rejected": -1.9297301769256592, "logps/chosen": -269.7955017089844, "logps/rejected": -345.8044738769531, "loss": 0.7055, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.1849523782730103, "rewards/margins": 66.4659423828125, "rewards/rejected": -67.65090942382812, "step": 1740 }, { "epoch": 0.3434739941118744, "grad_norm": 96.0001277567152, "learning_rate": 4.1506439470459056e-07, "logits/chosen": -2.238967180252075, "logits/rejected": -1.9937868118286133, "logps/chosen": -242.6586456298828, "logps/rejected": -292.09356689453125, "loss": 0.7877, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -8.422876358032227, "rewards/margins": 62.2960319519043, "rewards/rejected": -70.71890258789062, "step": 1750 }, { "epoch": 0.34543670264965654, "grad_norm": 136.57070606363177, "learning_rate": 4.137740200308537e-07, "logits/chosen": -2.3446662425994873, "logits/rejected": -1.9477441310882568, "logps/chosen": -290.60186767578125, "logps/rejected": -307.41949462890625, "loss": 0.79, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.21036934852600098, "rewards/margins": 39.92430877685547, "rewards/rejected": -40.13467025756836, "step": 1760 }, { "epoch": 0.34739941118743867, "grad_norm": 88.8458403302276, "learning_rate": 4.124759564542295e-07, "logits/chosen": -2.276298999786377, "logits/rejected": -1.9049484729766846, "logps/chosen": -313.21923828125, "logps/rejected": -276.65179443359375, "loss": 0.8076, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -17.07889175415039, "rewards/margins": 46.8661003112793, "rewards/rejected": -63.94499588012695, "step": 1770 }, { "epoch": 0.3493621197252208, "grad_norm": 97.84528595875716, "learning_rate": 4.111702649165255e-07, "logits/chosen": -2.3248953819274902, "logits/rejected": -1.8017902374267578, "logps/chosen": -264.0790100097656, "logps/rejected": -277.5160217285156, "loss": 0.8278, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -12.431878089904785, "rewards/margins": 49.67168426513672, "rewards/rejected": -62.10355758666992, "step": 1780 }, { "epoch": 0.35132482826300293, "grad_norm": 59.33630654350773, "learning_rate": 4.0985700671766834e-07, "logits/chosen": -2.427077054977417, "logits/rejected": -1.7638553380966187, "logps/chosen": -342.54522705078125, "logps/rejected": -340.16192626953125, "loss": 0.7236, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 12.516778945922852, "rewards/margins": 71.02592468261719, "rewards/rejected": -58.5091438293457, "step": 1790 }, { "epoch": 0.35328753680078506, "grad_norm": 89.76083306994288, "learning_rate": 4.085362435128262e-07, "logits/chosen": -2.3768677711486816, "logits/rejected": -1.6395857334136963, "logps/chosen": -292.1382751464844, "logps/rejected": -345.7778015136719, "loss": 0.7214, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.840475559234619, "rewards/margins": 90.30509948730469, "rewards/rejected": -95.14556884765625, "step": 1800 }, { "epoch": 0.35525024533856725, "grad_norm": 153.73077318064222, "learning_rate": 4.0720803730951423e-07, "logits/chosen": -2.7150356769561768, "logits/rejected": -2.0028915405273438, "logps/chosen": -312.307373046875, "logps/rejected": -256.82623291015625, "loss": 0.7594, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -2.1445558071136475, "rewards/margins": 61.59296798706055, "rewards/rejected": -63.737525939941406, "step": 1810 }, { "epoch": 0.3572129538763494, "grad_norm": 147.60872090580003, "learning_rate": 4.058724504646834e-07, "logits/chosen": -2.6029253005981445, "logits/rejected": -2.1005451679229736, "logps/chosen": -229.44210815429688, "logps/rejected": -276.8072814941406, "loss": 0.7005, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -0.6633428931236267, "rewards/margins": 48.877403259277344, "rewards/rejected": -49.540748596191406, "step": 1820 }, { "epoch": 0.3591756624141315, "grad_norm": 88.91550474133544, "learning_rate": 4.045295456817924e-07, "logits/chosen": -2.4975953102111816, "logits/rejected": -2.367603063583374, "logps/chosen": -291.25164794921875, "logps/rejected": -336.652587890625, "loss": 0.7352, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -14.37291431427002, "rewards/margins": 44.442222595214844, "rewards/rejected": -58.81513595581055, "step": 1830 }, { "epoch": 0.36113837095191365, "grad_norm": 93.12070855757399, "learning_rate": 4.0317938600786484e-07, "logits/chosen": -2.483926773071289, "logits/rejected": -2.2988216876983643, "logps/chosen": -313.47296142578125, "logps/rejected": -319.6980285644531, "loss": 0.8287, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -18.503278732299805, "rewards/margins": 25.62750816345215, "rewards/rejected": -44.13079071044922, "step": 1840 }, { "epoch": 0.3631010794896958, "grad_norm": 81.25944733335075, "learning_rate": 4.0182203483052825e-07, "logits/chosen": -2.4069361686706543, "logits/rejected": -2.103092908859253, "logps/chosen": -304.81439208984375, "logps/rejected": -274.3727111816406, "loss": 0.7443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.86172866821289, "rewards/margins": 44.73784637451172, "rewards/rejected": -68.59957122802734, "step": 1850 }, { "epoch": 0.3650637880274779, "grad_norm": 76.19538335552355, "learning_rate": 4.004575558750389e-07, "logits/chosen": -2.887248992919922, "logits/rejected": -2.0836079120635986, "logps/chosen": -358.33245849609375, "logps/rejected": -355.9374694824219, "loss": 0.7662, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -7.720302581787109, "rewards/margins": 46.76068115234375, "rewards/rejected": -54.480979919433594, "step": 1860 }, { "epoch": 0.36702649656526004, "grad_norm": 88.25519947399084, "learning_rate": 3.9908601320128976e-07, "logits/chosen": -2.2543537616729736, "logits/rejected": -1.8512779474258423, "logps/chosen": -253.5527801513672, "logps/rejected": -318.8297424316406, "loss": 0.8846, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -16.1134033203125, "rewards/margins": 60.856163024902344, "rewards/rejected": -76.96956634521484, "step": 1870 }, { "epoch": 0.3689892051030422, "grad_norm": 140.68956690048748, "learning_rate": 3.9770747120080284e-07, "logits/chosen": -2.5704903602600098, "logits/rejected": -2.337005853652954, "logps/chosen": -231.7997283935547, "logps/rejected": -270.393310546875, "loss": 0.7908, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.261826515197754, "rewards/margins": 54.714195251464844, "rewards/rejected": -60.97602081298828, "step": 1880 }, { "epoch": 0.37095191364082436, "grad_norm": 69.4450623896453, "learning_rate": 3.963219945937063e-07, "logits/chosen": -2.833038806915283, "logits/rejected": -2.3672146797180176, "logps/chosen": -228.93789672851562, "logps/rejected": -255.3886260986328, "loss": 0.8062, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 4.233958721160889, "rewards/margins": 33.97795867919922, "rewards/rejected": -29.743999481201172, "step": 1890 }, { "epoch": 0.3729146221786065, "grad_norm": 947.9738734221075, "learning_rate": 3.949296484256959e-07, "logits/chosen": -2.866281032562256, "logits/rejected": -2.4184844493865967, "logps/chosen": -261.49560546875, "logps/rejected": -287.2271423339844, "loss": 0.8301, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -6.29120397567749, "rewards/margins": 37.32862854003906, "rewards/rejected": -43.61983871459961, "step": 1900 }, { "epoch": 0.3748773307163886, "grad_norm": 63.86552007946855, "learning_rate": 3.935304980649813e-07, "logits/chosen": -2.6993985176086426, "logits/rejected": -2.1234941482543945, "logps/chosen": -306.4438171386719, "logps/rejected": -355.7987976074219, "loss": 0.8267, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -6.36506986618042, "rewards/margins": 70.15254211425781, "rewards/rejected": -76.51761627197266, "step": 1910 }, { "epoch": 0.37684003925417076, "grad_norm": 59.06352220636853, "learning_rate": 3.92124609199217e-07, "logits/chosen": -2.6890501976013184, "logits/rejected": -2.459531307220459, "logps/chosen": -201.2141876220703, "logps/rejected": -254.01620483398438, "loss": 0.7191, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 3.0603458881378174, "rewards/margins": 39.81940460205078, "rewards/rejected": -36.759056091308594, "step": 1920 }, { "epoch": 0.3788027477919529, "grad_norm": 63.666631906687456, "learning_rate": 3.907120478324185e-07, "logits/chosen": -2.8293230533599854, "logits/rejected": -2.4793219566345215, "logps/chosen": -298.23114013671875, "logps/rejected": -309.5526428222656, "loss": 0.8063, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -8.38193416595459, "rewards/margins": 30.225805282592773, "rewards/rejected": -38.60773849487305, "step": 1930 }, { "epoch": 0.380765456329735, "grad_norm": 104.5224173509673, "learning_rate": 3.8929288028186364e-07, "logits/chosen": -2.9245762825012207, "logits/rejected": -2.5311379432678223, "logps/chosen": -219.61868286132812, "logps/rejected": -236.66757202148438, "loss": 0.6858, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -0.629956066608429, "rewards/margins": 39.0811767578125, "rewards/rejected": -39.71113204956055, "step": 1940 }, { "epoch": 0.38272816486751715, "grad_norm": 85.56284474184923, "learning_rate": 3.8786717317497875e-07, "logits/chosen": -2.640650510787964, "logits/rejected": -2.3510475158691406, "logps/chosen": -317.9199523925781, "logps/rejected": -331.4421691894531, "loss": 0.7293, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -4.25482702255249, "rewards/margins": 56.81431198120117, "rewards/rejected": -61.06914138793945, "step": 1950 }, { "epoch": 0.38469087340529934, "grad_norm": 80.1753764362152, "learning_rate": 3.864349934462111e-07, "logits/chosen": -2.6217799186706543, "logits/rejected": -2.1534552574157715, "logps/chosen": -265.29925537109375, "logps/rejected": -316.73583984375, "loss": 0.6642, "rewards/accuracies": 0.7333332300186157, "rewards/chosen": -2.8556511402130127, "rewards/margins": 68.94566345214844, "rewards/rejected": -71.80131530761719, "step": 1960 }, { "epoch": 0.38665358194308147, "grad_norm": 99.20021473307554, "learning_rate": 3.84996408333886e-07, "logits/chosen": -2.6990816593170166, "logits/rejected": -1.8822605609893799, "logps/chosen": -324.51385498046875, "logps/rejected": -312.45556640625, "loss": 0.7814, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -18.279890060424805, "rewards/margins": 72.79891204833984, "rewards/rejected": -91.07878875732422, "step": 1970 }, { "epoch": 0.3886162904808636, "grad_norm": 79.48327918177439, "learning_rate": 3.8355148537705047e-07, "logits/chosen": -2.4228298664093018, "logits/rejected": -2.188333034515381, "logps/chosen": -243.294677734375, "logps/rejected": -276.08636474609375, "loss": 0.7327, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -12.645560264587402, "rewards/margins": 58.80458450317383, "rewards/rejected": -71.45014953613281, "step": 1980 }, { "epoch": 0.39057899901864573, "grad_norm": 111.0413557832772, "learning_rate": 3.8210029241230204e-07, "logits/chosen": -2.622675895690918, "logits/rejected": -2.026853084564209, "logps/chosen": -360.7319030761719, "logps/rejected": -353.36163330078125, "loss": 0.7884, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -12.795580863952637, "rewards/margins": 56.804588317871094, "rewards/rejected": -69.60017395019531, "step": 1990 }, { "epoch": 0.39254170755642787, "grad_norm": 68.77866455450156, "learning_rate": 3.806428975706042e-07, "logits/chosen": -2.198331832885742, "logits/rejected": -1.8165390491485596, "logps/chosen": -242.2079620361328, "logps/rejected": -281.45330810546875, "loss": 0.7855, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -7.758261680603027, "rewards/margins": 44.08034133911133, "rewards/rejected": -51.838600158691406, "step": 2000 }, { "epoch": 0.39450441609421, "grad_norm": 76.63205663184101, "learning_rate": 3.791793692740876e-07, "logits/chosen": -2.5789237022399902, "logits/rejected": -1.966534972190857, "logps/chosen": -231.5847930908203, "logps/rejected": -224.9019775390625, "loss": 0.7719, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 0.8760717511177063, "rewards/margins": 57.25469970703125, "rewards/rejected": -56.378623962402344, "step": 2010 }, { "epoch": 0.39646712463199213, "grad_norm": 74.53163786705147, "learning_rate": 3.777097762328381e-07, "logits/chosen": -2.468264102935791, "logits/rejected": -1.9381202459335327, "logps/chosen": -290.8273010253906, "logps/rejected": -307.20684814453125, "loss": 0.7651, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.561423301696777, "rewards/margins": 52.12416458129883, "rewards/rejected": -46.56273651123047, "step": 2020 }, { "epoch": 0.39842983316977426, "grad_norm": 131.901150019632, "learning_rate": 3.762341874416702e-07, "logits/chosen": -2.2452383041381836, "logits/rejected": -1.173287272453308, "logps/chosen": -228.6553497314453, "logps/rejected": -242.8582763671875, "loss": 0.7133, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -1.1953849792480469, "rewards/margins": 77.95489501953125, "rewards/rejected": -79.15027618408203, "step": 2030 }, { "epoch": 0.40039254170755645, "grad_norm": 118.73356744730785, "learning_rate": 3.7475267217688896e-07, "logits/chosen": -2.0608322620391846, "logits/rejected": -1.9833694696426392, "logps/chosen": -219.80886840820312, "logps/rejected": -308.75628662109375, "loss": 0.7499, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -19.390256881713867, "rewards/margins": 36.53691101074219, "rewards/rejected": -55.927162170410156, "step": 2040 }, { "epoch": 0.4023552502453386, "grad_norm": 90.19862799614565, "learning_rate": 3.7326529999303633e-07, "logits/chosen": -1.8816215991973877, "logits/rejected": -1.6762107610702515, "logps/chosen": -232.3477020263672, "logps/rejected": -327.2183532714844, "loss": 0.8383, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -16.654117584228516, "rewards/margins": 53.1618537902832, "rewards/rejected": -69.81597900390625, "step": 2050 }, { "epoch": 0.4043179587831207, "grad_norm": 173.1650439779948, "learning_rate": 3.7177214071962684e-07, "logits/chosen": -2.1543967723846436, "logits/rejected": -1.9147236347198486, "logps/chosen": -258.921875, "logps/rejected": -365.42694091796875, "loss": 0.8972, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6.673360347747803, "rewards/margins": 43.73550033569336, "rewards/rejected": -50.40886688232422, "step": 2060 }, { "epoch": 0.40628066732090284, "grad_norm": 72.30632881660155, "learning_rate": 3.7027326445786835e-07, "logits/chosen": -2.560781478881836, "logits/rejected": -2.3070003986358643, "logps/chosen": -257.6041259765625, "logps/rejected": -271.6292724609375, "loss": 0.7997, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.9452606439590454, "rewards/margins": 32.820884704589844, "rewards/rejected": -33.766143798828125, "step": 2070 }, { "epoch": 0.408243375858685, "grad_norm": 190.54946605676633, "learning_rate": 3.6876874157737167e-07, "logits/chosen": -2.495839834213257, "logits/rejected": -2.3546650409698486, "logps/chosen": -271.22247314453125, "logps/rejected": -317.8890686035156, "loss": 0.7997, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -10.120208740234375, "rewards/margins": 23.467655181884766, "rewards/rejected": -33.58786392211914, "step": 2080 }, { "epoch": 0.4102060843964671, "grad_norm": 126.86812875031383, "learning_rate": 3.67258642712846e-07, "logits/chosen": -2.7649986743927, "logits/rejected": -2.4985971450805664, "logps/chosen": -260.84820556640625, "logps/rejected": -224.01742553710938, "loss": 0.7948, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.377007961273193, "rewards/margins": 18.694726943969727, "rewards/rejected": -26.071735382080078, "step": 2090 }, { "epoch": 0.41216879293424924, "grad_norm": 100.32531417496463, "learning_rate": 3.6574303876078366e-07, "logits/chosen": -2.681678295135498, "logits/rejected": -2.274333953857422, "logps/chosen": -279.39874267578125, "logps/rejected": -318.3245849609375, "loss": 0.6946, "rewards/accuracies": 0.7333332300186157, "rewards/chosen": 3.419281005859375, "rewards/margins": 48.134620666503906, "rewards/rejected": -44.71533966064453, "step": 2100 }, { "epoch": 0.4141315014720314, "grad_norm": 92.07287735796785, "learning_rate": 3.642220008761309e-07, "logits/chosen": -2.5852322578430176, "logits/rejected": -1.926807165145874, "logps/chosen": -323.15631103515625, "logps/rejected": -346.3016662597656, "loss": 0.6499, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 4.408158302307129, "rewards/margins": 63.764373779296875, "rewards/rejected": -59.3562126159668, "step": 2110 }, { "epoch": 0.41609421000981356, "grad_norm": 156.9350130255696, "learning_rate": 3.626956004689476e-07, "logits/chosen": -2.516247034072876, "logits/rejected": -1.830751657485962, "logps/chosen": -357.89642333984375, "logps/rejected": -268.8968200683594, "loss": 0.7336, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.661848545074463, "rewards/margins": 57.362144470214844, "rewards/rejected": -53.70029830932617, "step": 2120 }, { "epoch": 0.4180569185475957, "grad_norm": 74.95601528879678, "learning_rate": 3.6116390920105474e-07, "logits/chosen": -2.614553213119507, "logits/rejected": -2.37044095993042, "logps/chosen": -271.95367431640625, "logps/rejected": -271.39788818359375, "loss": 0.7679, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -4.626086711883545, "rewards/margins": 31.782245635986328, "rewards/rejected": -36.4083366394043, "step": 2130 }, { "epoch": 0.4200196270853778, "grad_norm": 79.26406264591427, "learning_rate": 3.5962699898266983e-07, "logits/chosen": -2.6553640365600586, "logits/rejected": -2.316606044769287, "logps/chosen": -245.4364776611328, "logps/rejected": -254.887451171875, "loss": 0.7884, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -1.4198448657989502, "rewards/margins": 37.23655319213867, "rewards/rejected": -38.656394958496094, "step": 2140 }, { "epoch": 0.42198233562315995, "grad_norm": 69.39813900987257, "learning_rate": 3.5808494196903117e-07, "logits/chosen": -2.5761935710906982, "logits/rejected": -2.0720858573913574, "logps/chosen": -329.97552490234375, "logps/rejected": -264.9439392089844, "loss": 0.7386, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.72826886177063, "rewards/margins": 46.65421676635742, "rewards/rejected": -50.38248825073242, "step": 2150 }, { "epoch": 0.4239450441609421, "grad_norm": 371.83259748316146, "learning_rate": 3.565378105570097e-07, "logits/chosen": -2.456577777862549, "logits/rejected": -1.8832343816757202, "logps/chosen": -304.7551574707031, "logps/rejected": -271.1561279296875, "loss": 0.8575, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -20.67996597290039, "rewards/margins": 43.085914611816406, "rewards/rejected": -63.7658805847168, "step": 2160 }, { "epoch": 0.4259077526987242, "grad_norm": 79.37985826373466, "learning_rate": 3.549856773817107e-07, "logits/chosen": -2.3633110523223877, "logits/rejected": -1.906701683998108, "logps/chosen": -237.6493682861328, "logps/rejected": -286.90093994140625, "loss": 0.7012, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9081697463989258, "rewards/margins": 71.75938415527344, "rewards/rejected": -70.85121154785156, "step": 2170 }, { "epoch": 0.4278704612365064, "grad_norm": 97.03362215312377, "learning_rate": 3.5342861531306344e-07, "logits/chosen": -2.5736143589019775, "logits/rejected": -2.09263014793396, "logps/chosen": -247.7324676513672, "logps/rejected": -261.7373352050781, "loss": 0.7408, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 6.46017599105835, "rewards/margins": 52.185691833496094, "rewards/rejected": -45.725521087646484, "step": 2180 }, { "epoch": 0.42983316977428854, "grad_norm": 84.06642123871259, "learning_rate": 3.518666974524002e-07, "logits/chosen": -2.4714062213897705, "logits/rejected": -1.950839638710022, "logps/chosen": -326.78900146484375, "logps/rejected": -349.24462890625, "loss": 0.7433, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -10.755969047546387, "rewards/margins": 65.1372299194336, "rewards/rejected": -75.89320373535156, "step": 2190 }, { "epoch": 0.43179587831207067, "grad_norm": 53.469223684053766, "learning_rate": 3.5029999712902387e-07, "logits/chosen": -2.8162899017333984, "logits/rejected": -2.569418430328369, "logps/chosen": -322.38470458984375, "logps/rejected": -391.33642578125, "loss": 0.729, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 6.919148921966553, "rewards/margins": 43.632606506347656, "rewards/rejected": -36.71345520019531, "step": 2200 }, { "epoch": 0.4337585868498528, "grad_norm": 72.77715086116586, "learning_rate": 3.4872858789676583e-07, "logits/chosen": -2.285487174987793, "logits/rejected": -2.385848045349121, "logps/chosen": -250.610595703125, "logps/rejected": -263.30145263671875, "loss": 0.8545, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -17.683425903320312, "rewards/margins": 10.88974666595459, "rewards/rejected": -28.573171615600586, "step": 2210 }, { "epoch": 0.43572129538763493, "grad_norm": 91.60096123212047, "learning_rate": 3.4715254353053236e-07, "logits/chosen": -2.5583057403564453, "logits/rejected": -2.00654935836792, "logps/chosen": -273.78204345703125, "logps/rejected": -345.2774963378906, "loss": 0.7759, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.849860429763794, "rewards/margins": 62.46953582763672, "rewards/rejected": -64.31939697265625, "step": 2220 }, { "epoch": 0.43768400392541706, "grad_norm": 108.55550907262693, "learning_rate": 3.4557193802284123e-07, "logits/chosen": -2.4978508949279785, "logits/rejected": -2.0906498432159424, "logps/chosen": -273.63299560546875, "logps/rejected": -309.2057800292969, "loss": 0.8405, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -8.902594566345215, "rewards/margins": 42.160133361816406, "rewards/rejected": -51.06272888183594, "step": 2230 }, { "epoch": 0.4396467124631992, "grad_norm": 74.1624535419027, "learning_rate": 3.4398684558034763e-07, "logits/chosen": -2.283446788787842, "logits/rejected": -2.1315581798553467, "logps/chosen": -256.9490661621094, "logps/rejected": -289.937744140625, "loss": 1.2659, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": 11.52694320678711, "rewards/margins": 51.53630447387695, "rewards/rejected": -40.009361267089844, "step": 2240 }, { "epoch": 0.44160942100098133, "grad_norm": 131.73213213810183, "learning_rate": 3.4239734062036067e-07, "logits/chosen": -2.3359310626983643, "logits/rejected": -1.9699054956436157, "logps/chosen": -280.51312255859375, "logps/rejected": -339.35198974609375, "loss": 0.6753, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.535205841064453, "rewards/margins": 63.005889892578125, "rewards/rejected": -59.47068405151367, "step": 2250 }, { "epoch": 0.4435721295387635, "grad_norm": 78.71608005921068, "learning_rate": 3.4080349776734924e-07, "logits/chosen": -2.3604981899261475, "logits/rejected": -1.8668235540390015, "logps/chosen": -293.6802978515625, "logps/rejected": -334.95367431640625, "loss": 0.7631, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 4.981619834899902, "rewards/margins": 62.29719161987305, "rewards/rejected": -57.315574645996094, "step": 2260 }, { "epoch": 0.44553483807654565, "grad_norm": 182.44555964483914, "learning_rate": 3.392053918494389e-07, "logits/chosen": -2.2618048191070557, "logits/rejected": -1.997514009475708, "logps/chosen": -297.23095703125, "logps/rejected": -309.30096435546875, "loss": 0.7756, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 9.042552947998047, "rewards/margins": 48.19974899291992, "rewards/rejected": -39.15719985961914, "step": 2270 }, { "epoch": 0.4474975466143278, "grad_norm": 123.11819302635585, "learning_rate": 3.376030978948983e-07, "logits/chosen": -2.002929210662842, "logits/rejected": -1.297489881515503, "logps/chosen": -352.963623046875, "logps/rejected": -368.9997253417969, "loss": 0.7702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.6209774017334, "rewards/margins": 51.065704345703125, "rewards/rejected": -81.68668365478516, "step": 2280 }, { "epoch": 0.4494602551521099, "grad_norm": 83.73241219291138, "learning_rate": 3.3599669112861756e-07, "logits/chosen": -2.3695321083068848, "logits/rejected": -1.7795994281768799, "logps/chosen": -277.69952392578125, "logps/rejected": -371.2049255371094, "loss": 0.7793, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -8.422652244567871, "rewards/margins": 70.8532485961914, "rewards/rejected": -79.2759017944336, "step": 2290 }, { "epoch": 0.45142296368989204, "grad_norm": 154.28270553435198, "learning_rate": 3.343862469685755e-07, "logits/chosen": -2.4093754291534424, "logits/rejected": -1.9804599285125732, "logps/chosen": -235.0762939453125, "logps/rejected": -278.19146728515625, "loss": 0.7443, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 5.140088081359863, "rewards/margins": 56.16924285888672, "rewards/rejected": -51.02915954589844, "step": 2300 }, { "epoch": 0.4533856722276742, "grad_norm": 79.16075258138234, "learning_rate": 3.3277184102230004e-07, "logits/chosen": -2.528869152069092, "logits/rejected": -2.3656325340270996, "logps/chosen": -259.2718811035156, "logps/rejected": -300.1874084472656, "loss": 0.796, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": 10.365346908569336, "rewards/margins": 36.16173553466797, "rewards/rejected": -25.7963924407959, "step": 2310 }, { "epoch": 0.4553483807654563, "grad_norm": 101.30289632458975, "learning_rate": 3.311535490833176e-07, "logits/chosen": -2.159823417663574, "logits/rejected": -1.8414647579193115, "logps/chosen": -235.87802124023438, "logps/rejected": -321.6006774902344, "loss": 0.7954, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 13.073580741882324, "rewards/margins": 47.72212219238281, "rewards/rejected": -34.64854049682617, "step": 2320 }, { "epoch": 0.4573110893032385, "grad_norm": 99.15024459394922, "learning_rate": 3.2953144712759537e-07, "logits/chosen": -2.5285305976867676, "logits/rejected": -1.3595932722091675, "logps/chosen": -337.5126647949219, "logps/rejected": -333.326904296875, "loss": 0.7434, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 5.1543707847595215, "rewards/margins": 88.9986343383789, "rewards/rejected": -83.84425354003906, "step": 2330 }, { "epoch": 0.4592737978410206, "grad_norm": 77.40873417391985, "learning_rate": 3.279056113099742e-07, "logits/chosen": -2.3238701820373535, "logits/rejected": -1.9099338054656982, "logps/chosen": -281.14605712890625, "logps/rejected": -397.33056640625, "loss": 0.712, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.3794989585876465, "rewards/margins": 74.348388671875, "rewards/rejected": -67.96888732910156, "step": 2340 }, { "epoch": 0.46123650637880276, "grad_norm": 70.2241345593468, "learning_rate": 3.2627611796059283e-07, "logits/chosen": -2.2523574829101562, "logits/rejected": -1.8265279531478882, "logps/chosen": -280.9069519042969, "logps/rejected": -288.019287109375, "loss": 0.7744, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -7.937745094299316, "rewards/margins": 41.65926742553711, "rewards/rejected": -49.597015380859375, "step": 2350 }, { "epoch": 0.4631992149165849, "grad_norm": 73.39606876693746, "learning_rate": 3.246430435813051e-07, "logits/chosen": -2.1988284587860107, "logits/rejected": -1.6169681549072266, "logps/chosen": -303.73712158203125, "logps/rejected": -281.5367431640625, "loss": 0.7851, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -30.069738388061523, "rewards/margins": 33.6140251159668, "rewards/rejected": -63.68375778198242, "step": 2360 }, { "epoch": 0.465161923454367, "grad_norm": 301.728340574257, "learning_rate": 3.230064648420878e-07, "logits/chosen": -2.492340564727783, "logits/rejected": -1.8189395666122437, "logps/chosen": -278.4673767089844, "logps/rejected": -230.20980834960938, "loss": 0.7568, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.769139051437378, "rewards/margins": 35.6246223449707, "rewards/rejected": -37.39376449584961, "step": 2370 }, { "epoch": 0.46712463199214915, "grad_norm": 111.74638031502089, "learning_rate": 3.2136645857744114e-07, "logits/chosen": -2.0232133865356445, "logits/rejected": -1.5899323225021362, "logps/chosen": -238.6339111328125, "logps/rejected": -329.6146240234375, "loss": 0.6911, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -6.115650177001953, "rewards/margins": 46.511444091796875, "rewards/rejected": -52.627098083496094, "step": 2380 }, { "epoch": 0.4690873405299313, "grad_norm": 68.76310478402067, "learning_rate": 3.197231017827818e-07, "logits/chosen": -2.267308235168457, "logits/rejected": -1.5626227855682373, "logps/chosen": -271.4465637207031, "logps/rejected": -309.33343505859375, "loss": 0.6671, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 10.058176040649414, "rewards/margins": 58.2335090637207, "rewards/rejected": -48.175323486328125, "step": 2390 }, { "epoch": 0.47105004906771347, "grad_norm": 90.36682455743738, "learning_rate": 3.1807647161082797e-07, "logits/chosen": -2.3689377307891846, "logits/rejected": -1.390862226486206, "logps/chosen": -274.2768249511719, "logps/rejected": -325.57861328125, "loss": 0.8014, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5262811779975891, "rewards/margins": 68.48920440673828, "rewards/rejected": -67.9629135131836, "step": 2400 }, { "epoch": 0.4730127576054956, "grad_norm": 112.21087368475393, "learning_rate": 3.1642664536797693e-07, "logits/chosen": -2.00355863571167, "logits/rejected": -1.1795814037322998, "logps/chosen": -287.94110107421875, "logps/rejected": -344.9960021972656, "loss": 0.7288, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.945353507995605, "rewards/margins": 55.703651428222656, "rewards/rejected": -70.64900207519531, "step": 2410 }, { "epoch": 0.47497546614327774, "grad_norm": 94.7073509592962, "learning_rate": 3.147737005106762e-07, "logits/chosen": -2.006748676300049, "logits/rejected": -1.6412168741226196, "logps/chosen": -301.3794860839844, "logps/rejected": -350.7590637207031, "loss": 0.7193, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -2.3396103382110596, "rewards/margins": 73.76606750488281, "rewards/rejected": -76.10566711425781, "step": 2420 }, { "epoch": 0.47693817468105987, "grad_norm": 63.91702153405817, "learning_rate": 3.1311771464178655e-07, "logits/chosen": -1.7556310892105103, "logits/rejected": -0.991845428943634, "logps/chosen": -295.7560119628906, "logps/rejected": -280.8089294433594, "loss": 0.6876, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5076007843017578, "rewards/margins": 64.08277893066406, "rewards/rejected": -64.59038543701172, "step": 2430 }, { "epoch": 0.478900883218842, "grad_norm": 89.12126053333517, "learning_rate": 3.1145876550693893e-07, "logits/chosen": -2.3325510025024414, "logits/rejected": -1.3649237155914307, "logps/chosen": -290.2318115234375, "logps/rejected": -317.83966064453125, "loss": 0.633, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -4.481060028076172, "rewards/margins": 79.38648223876953, "rewards/rejected": -83.86753845214844, "step": 2440 }, { "epoch": 0.48086359175662413, "grad_norm": 195.3713158104856, "learning_rate": 3.097969309908847e-07, "logits/chosen": -1.782356858253479, "logits/rejected": -0.9384613037109375, "logps/chosen": -246.55908203125, "logps/rejected": -262.39031982421875, "loss": 0.7066, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -7.346467018127441, "rewards/margins": 48.303184509277344, "rewards/rejected": -55.64965057373047, "step": 2450 }, { "epoch": 0.48282630029440626, "grad_norm": 115.35614586098585, "learning_rate": 3.081322891138382e-07, "logits/chosen": -1.910089135169983, "logits/rejected": -1.6426465511322021, "logps/chosen": -304.06610107421875, "logps/rejected": -312.9458312988281, "loss": 0.6948, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 1.1127185821533203, "rewards/margins": 52.02979278564453, "rewards/rejected": -50.917076110839844, "step": 2460 }, { "epoch": 0.4847890088321884, "grad_norm": 62.473748679461835, "learning_rate": 3.0646491802781514e-07, "logits/chosen": -1.7581707239151, "logits/rejected": -1.28305983543396, "logps/chosen": -299.50616455078125, "logps/rejected": -217.08871459960938, "loss": 0.8111, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -32.85947799682617, "rewards/margins": 3.7508063316345215, "rewards/rejected": -36.610286712646484, "step": 2470 }, { "epoch": 0.4867517173699706, "grad_norm": 73.18523957876884, "learning_rate": 3.047948960129624e-07, "logits/chosen": -2.2639994621276855, "logits/rejected": -1.606888771057129, "logps/chosen": -203.5137481689453, "logps/rejected": -260.36712646484375, "loss": 0.7726, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": 5.983282089233398, "rewards/margins": 48.81607437133789, "rewards/rejected": -42.832794189453125, "step": 2480 }, { "epoch": 0.4887144259077527, "grad_norm": 91.43059159846737, "learning_rate": 3.0312230147388334e-07, "logits/chosen": -2.6283905506134033, "logits/rejected": -1.8649413585662842, "logps/chosen": -301.3817443847656, "logps/rejected": -365.62310791015625, "loss": 0.7214, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 7.838557243347168, "rewards/margins": 66.22442626953125, "rewards/rejected": -58.38587188720703, "step": 2490 }, { "epoch": 0.49067713444553485, "grad_norm": 81.23215779996293, "learning_rate": 3.01447212935957e-07, "logits/chosen": -2.2705702781677246, "logits/rejected": -2.3151369094848633, "logps/chosen": -227.7688446044922, "logps/rejected": -270.2987365722656, "loss": 0.7301, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": -1.3205721378326416, "rewards/margins": 17.929821014404297, "rewards/rejected": -19.25039291381836, "step": 2500 }, { "epoch": 0.492639842983317, "grad_norm": 167.33471423322146, "learning_rate": 2.9976970904165104e-07, "logits/chosen": -2.682750701904297, "logits/rejected": -1.8870923519134521, "logps/chosen": -374.3584899902344, "logps/rejected": -339.45806884765625, "loss": 0.7446, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 10.586302757263184, "rewards/margins": 55.3603401184082, "rewards/rejected": -44.77404022216797, "step": 2510 }, { "epoch": 0.4946025515210991, "grad_norm": 131.69231694679772, "learning_rate": 2.980898685468301e-07, "logits/chosen": -2.5056021213531494, "logits/rejected": -1.670548677444458, "logps/chosen": -290.6415100097656, "logps/rejected": -286.2872619628906, "loss": 0.7319, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": 8.999971389770508, "rewards/margins": 62.56276321411133, "rewards/rejected": -53.56279373168945, "step": 2520 }, { "epoch": 0.49656526005888124, "grad_norm": 59.68499120601824, "learning_rate": 2.96407770317058e-07, "logits/chosen": -1.9421151876449585, "logits/rejected": -1.5854791402816772, "logps/chosen": -211.97714233398438, "logps/rejected": -255.60983276367188, "loss": 0.6687, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -3.5800819396972656, "rewards/margins": 43.62063217163086, "rewards/rejected": -47.20071792602539, "step": 2530 }, { "epoch": 0.4985279685966634, "grad_norm": 164.0221678743211, "learning_rate": 2.9472349332389523e-07, "logits/chosen": -2.2011830806732178, "logits/rejected": -1.3374918699264526, "logps/chosen": -305.1778564453125, "logps/rejected": -273.8752746582031, "loss": 0.8353, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -11.803207397460938, "rewards/margins": 62.515220642089844, "rewards/rejected": -74.31842803955078, "step": 2540 }, { "epoch": 0.5004906771344455, "grad_norm": 71.53189423853867, "learning_rate": 2.930371166411915e-07, "logits/chosen": -2.7879199981689453, "logits/rejected": -2.010589122772217, "logps/chosen": -310.1939392089844, "logps/rejected": -352.76483154296875, "loss": 0.8184, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.575814723968506, "rewards/margins": 44.90160369873047, "rewards/rejected": -40.32579040527344, "step": 2550 }, { "epoch": 0.5024533856722276, "grad_norm": 128.20902449484515, "learning_rate": 2.913487194413731e-07, "logits/chosen": -2.5633718967437744, "logits/rejected": -1.9780218601226807, "logps/chosen": -265.4022521972656, "logps/rejected": -357.032470703125, "loss": 0.649, "rewards/accuracies": 0.8999999165534973, "rewards/chosen": 17.259857177734375, "rewards/margins": 95.27832794189453, "rewards/rejected": -78.01847076416016, "step": 2560 }, { "epoch": 0.5044160942100098, "grad_norm": 79.52293948292893, "learning_rate": 2.896583809917262e-07, "logits/chosen": -2.475602388381958, "logits/rejected": -2.0246129035949707, "logps/chosen": -216.0830841064453, "logps/rejected": -273.3958435058594, "loss": 0.8226, "rewards/accuracies": 0.76666659116745, "rewards/chosen": 8.417243003845215, "rewards/margins": 56.79520797729492, "rewards/rejected": -48.377967834472656, "step": 2570 }, { "epoch": 0.5063788027477919, "grad_norm": 104.5175089055227, "learning_rate": 2.879661806506751e-07, "logits/chosen": -2.1604301929473877, "logits/rejected": -1.4284976720809937, "logps/chosen": -281.42132568359375, "logps/rejected": -357.19439697265625, "loss": 0.7612, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.481987953186035, "rewards/margins": 39.04777908325195, "rewards/rejected": -47.52977752685547, "step": 2580 }, { "epoch": 0.5083415112855741, "grad_norm": 171.6517079835066, "learning_rate": 2.86272197864057e-07, "logits/chosen": -2.455636978149414, "logits/rejected": -2.167994499206543, "logps/chosen": -327.0727844238281, "logps/rejected": -309.888427734375, "loss": 0.7338, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6.537430763244629, "rewards/margins": 49.190189361572266, "rewards/rejected": -55.72761154174805, "step": 2590 }, { "epoch": 0.5103042198233563, "grad_norm": 105.3635720681855, "learning_rate": 2.845765121613912e-07, "logits/chosen": -2.2805140018463135, "logits/rejected": -0.9445091485977173, "logps/chosen": -285.6549987792969, "logps/rejected": -300.0010986328125, "loss": 0.6046, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 9.905165672302246, "rewards/margins": 111.06465148925781, "rewards/rejected": -101.15948486328125, "step": 2600 }, { "epoch": 0.5122669283611384, "grad_norm": 100.67805296522914, "learning_rate": 2.828792031521464e-07, "logits/chosen": -2.3688759803771973, "logits/rejected": -1.4235527515411377, "logps/chosen": -304.8942565917969, "logps/rejected": -388.78594970703125, "loss": 0.7488, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 9.871818542480469, "rewards/margins": 103.49615478515625, "rewards/rejected": -93.62432861328125, "step": 2610 }, { "epoch": 0.5142296368989205, "grad_norm": 71.08961886012678, "learning_rate": 2.811803505220025e-07, "logits/chosen": -2.3813610076904297, "logits/rejected": -1.6739622354507446, "logps/chosen": -258.174560546875, "logps/rejected": -265.59661865234375, "loss": 0.73, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 5.341302871704102, "rewards/margins": 63.24491500854492, "rewards/rejected": -57.90361785888672, "step": 2620 }, { "epoch": 0.5161923454367027, "grad_norm": 99.05141153903656, "learning_rate": 2.7948003402910975e-07, "logits/chosen": -2.3363749980926514, "logits/rejected": -1.9819015264511108, "logps/chosen": -305.27984619140625, "logps/rejected": -318.68353271484375, "loss": 0.8267, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -7.2348737716674805, "rewards/margins": 57.62543487548828, "rewards/rejected": -64.86031341552734, "step": 2630 }, { "epoch": 0.5181550539744848, "grad_norm": 65.07962407964867, "learning_rate": 2.777783335003442e-07, "logits/chosen": -2.550055980682373, "logits/rejected": -2.3385143280029297, "logps/chosen": -313.05810546875, "logps/rejected": -309.99072265625, "loss": 0.7406, "rewards/accuracies": 0.7333332300186157, "rewards/chosen": -1.4819800853729248, "rewards/margins": 48.52701187133789, "rewards/rejected": -50.008995056152344, "step": 2640 }, { "epoch": 0.5201177625122669, "grad_norm": 109.60688120645206, "learning_rate": 2.760753288275598e-07, "logits/chosen": -2.6239876747131348, "logits/rejected": -1.9459753036499023, "logps/chosen": -246.33767700195312, "logps/rejected": -299.94464111328125, "loss": 0.8323, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 10.035989761352539, "rewards/margins": 57.07807159423828, "rewards/rejected": -47.042076110839844, "step": 2650 }, { "epoch": 0.5220804710500491, "grad_norm": 67.86790803569278, "learning_rate": 2.7437109996383795e-07, "logits/chosen": -2.5884299278259277, "logits/rejected": -2.1382975578308105, "logps/chosen": -258.0784606933594, "logps/rejected": -246.015869140625, "loss": 0.7776, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.644412040710449, "rewards/margins": 33.16940689086914, "rewards/rejected": -35.813819885253906, "step": 2660 }, { "epoch": 0.5240431795878312, "grad_norm": 84.09210913221163, "learning_rate": 2.7266572691973365e-07, "logits/chosen": -2.7655482292175293, "logits/rejected": -2.4460482597351074, "logps/chosen": -329.2676086425781, "logps/rejected": -299.7054748535156, "loss": 0.7367, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 4.372633934020996, "rewards/margins": 27.363567352294922, "rewards/rejected": -22.99093246459961, "step": 2670 }, { "epoch": 0.5260058881256133, "grad_norm": 107.64740597069778, "learning_rate": 2.709592897595191e-07, "logits/chosen": -2.3843436241149902, "logits/rejected": -1.7544450759887695, "logps/chosen": -263.9532165527344, "logps/rejected": -209.3268280029297, "loss": 0.7304, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 11.84433650970459, "rewards/margins": 35.54861831665039, "rewards/rejected": -23.70427703857422, "step": 2680 }, { "epoch": 0.5279685966633955, "grad_norm": 106.75615926888128, "learning_rate": 2.6925186859742494e-07, "logits/chosen": -2.3925697803497314, "logits/rejected": -2.174147367477417, "logps/chosen": -252.1218719482422, "logps/rejected": -263.73565673828125, "loss": 0.7874, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.29880952835083, "rewards/margins": 41.08768844604492, "rewards/rejected": -35.788875579833984, "step": 2690 }, { "epoch": 0.5299313052011776, "grad_norm": 82.92674985804514, "learning_rate": 2.675435435938788e-07, "logits/chosen": -2.337364435195923, "logits/rejected": -1.8014519214630127, "logps/chosen": -313.87237548828125, "logps/rejected": -318.05291748046875, "loss": 0.7379, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 1.7383445501327515, "rewards/margins": 63.7138557434082, "rewards/rejected": -61.97551345825195, "step": 2700 }, { "epoch": 0.5318940137389597, "grad_norm": 76.45153211146688, "learning_rate": 2.6583439495174247e-07, "logits/chosen": -2.5304274559020996, "logits/rejected": -1.5630953311920166, "logps/chosen": -284.4434814453125, "logps/rejected": -301.16082763671875, "loss": 0.7187, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.455230712890625, "rewards/margins": 67.56059265136719, "rewards/rejected": -61.10536575317383, "step": 2710 }, { "epoch": 0.5338567222767419, "grad_norm": 144.29258862250813, "learning_rate": 2.6412450291254564e-07, "logits/chosen": -1.7982561588287354, "logits/rejected": -1.105381727218628, "logps/chosen": -301.31243896484375, "logps/rejected": -301.9142761230469, "loss": 0.7467, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -9.941731452941895, "rewards/margins": 50.88005065917969, "rewards/rejected": -60.82177734375, "step": 2720 }, { "epoch": 0.535819430814524, "grad_norm": 83.07204946376287, "learning_rate": 2.6241394775271954e-07, "logits/chosen": -1.6235328912734985, "logits/rejected": -0.476736456155777, "logps/chosen": -264.97265625, "logps/rejected": -323.4992370605469, "loss": 0.7486, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -4.446869373321533, "rewards/margins": 89.97184753417969, "rewards/rejected": -94.41870880126953, "step": 2730 }, { "epoch": 0.5377821393523062, "grad_norm": 92.95743256135574, "learning_rate": 2.607028097798276e-07, "logits/chosen": -1.6831121444702148, "logits/rejected": -0.7414560914039612, "logps/chosen": -287.6153259277344, "logps/rejected": -360.23614501953125, "loss": 0.6514, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.644578218460083, "rewards/margins": 65.76050567626953, "rewards/rejected": -67.40509033203125, "step": 2740 }, { "epoch": 0.5397448478900884, "grad_norm": 104.03853453451688, "learning_rate": 2.5899116932879534e-07, "logits/chosen": -1.2863852977752686, "logits/rejected": -0.6404994130134583, "logps/chosen": -210.6661834716797, "logps/rejected": -292.912841796875, "loss": 0.6845, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -10.76870346069336, "rewards/margins": 73.64253234863281, "rewards/rejected": -84.41123962402344, "step": 2750 }, { "epoch": 0.5417075564278705, "grad_norm": 91.91012414846846, "learning_rate": 2.5727910675813866e-07, "logits/chosen": -1.5865745544433594, "logits/rejected": -1.3802332878112793, "logps/chosen": -251.1632537841797, "logps/rejected": -316.54931640625, "loss": 0.7554, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 1.7482845783233643, "rewards/margins": 46.00225067138672, "rewards/rejected": -44.253963470458984, "step": 2760 }, { "epoch": 0.5436702649656526, "grad_norm": 129.25108221566921, "learning_rate": 2.555667024461915e-07, "logits/chosen": -1.523798942565918, "logits/rejected": -1.3356215953826904, "logps/chosen": -245.5888671875, "logps/rejected": -348.0199890136719, "loss": 0.8582, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -12.945791244506836, "rewards/margins": 69.13800048828125, "rewards/rejected": -82.08379364013672, "step": 2770 }, { "epoch": 0.5456329735034348, "grad_norm": 99.1092761560908, "learning_rate": 2.5385403678733157e-07, "logits/chosen": -1.4167115688323975, "logits/rejected": -0.981336236000061, "logps/chosen": -215.4586181640625, "logps/rejected": -294.55072021484375, "loss": 0.8466, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.262603759765625, "rewards/margins": 46.738868713378906, "rewards/rejected": -63.00145721435547, "step": 2780 }, { "epoch": 0.5475956820412169, "grad_norm": 80.71398682969887, "learning_rate": 2.521411901882067e-07, "logits/chosen": -1.9221446514129639, "logits/rejected": -0.9243489503860474, "logps/chosen": -276.59759521484375, "logps/rejected": -289.756591796875, "loss": 0.7996, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -13.792913436889648, "rewards/margins": 69.9376449584961, "rewards/rejected": -83.73055267333984, "step": 2790 }, { "epoch": 0.549558390578999, "grad_norm": 77.31118150884694, "learning_rate": 2.504282430639594e-07, "logits/chosen": -2.344576120376587, "logits/rejected": -1.9155938625335693, "logps/chosen": -213.6534881591797, "logps/rejected": -256.0126037597656, "loss": 0.7238, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -5.494402885437012, "rewards/margins": 45.93522644042969, "rewards/rejected": -51.42962646484375, "step": 2800 }, { "epoch": 0.5515210991167812, "grad_norm": 127.35730721555944, "learning_rate": 2.4871527583445163e-07, "logits/chosen": -2.4504616260528564, "logits/rejected": -1.967984914779663, "logps/chosen": -303.8737487792969, "logps/rejected": -300.02294921875, "loss": 0.7431, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 5.556253433227539, "rewards/margins": 32.0588264465332, "rewards/rejected": -26.502573013305664, "step": 2810 }, { "epoch": 0.5534838076545633, "grad_norm": 92.9662797360762, "learning_rate": 2.470023689204893e-07, "logits/chosen": -2.4216175079345703, "logits/rejected": -2.0114285945892334, "logps/chosen": -293.913330078125, "logps/rejected": -328.92376708984375, "loss": 0.7388, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.6426097750663757, "rewards/margins": 49.677894592285156, "rewards/rejected": -49.03528594970703, "step": 2820 }, { "epoch": 0.5554465161923454, "grad_norm": 95.38839192188959, "learning_rate": 2.452896027400465e-07, "logits/chosen": -2.54085111618042, "logits/rejected": -1.7473074197769165, "logps/chosen": -296.4548034667969, "logps/rejected": -350.0361022949219, "loss": 0.7363, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -15.886466979980469, "rewards/margins": 51.79584884643555, "rewards/rejected": -67.68231201171875, "step": 2830 }, { "epoch": 0.5574092247301276, "grad_norm": 109.14038375179702, "learning_rate": 2.4357705770449046e-07, "logits/chosen": -2.463552713394165, "logits/rejected": -1.8203128576278687, "logps/chosen": -238.7277069091797, "logps/rejected": -308.0860290527344, "loss": 0.687, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 11.133429527282715, "rewards/margins": 81.58220672607422, "rewards/rejected": -70.44879150390625, "step": 2840 }, { "epoch": 0.5593719332679097, "grad_norm": 94.50743778154494, "learning_rate": 2.418648142148056e-07, "logits/chosen": -2.5345070362091064, "logits/rejected": -2.023996114730835, "logps/chosen": -283.60638427734375, "logps/rejected": -269.5262145996094, "loss": 0.7609, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": 0.20322799682617188, "rewards/margins": 52.58563232421875, "rewards/rejected": -52.382408142089844, "step": 2850 }, { "epoch": 0.5613346418056918, "grad_norm": 83.02242956305145, "learning_rate": 2.4015295265781966e-07, "logits/chosen": -2.2599425315856934, "logits/rejected": -1.7763019800186157, "logps/chosen": -287.1423034667969, "logps/rejected": -370.6893310546875, "loss": 0.6772, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 11.106006622314453, "rewards/margins": 73.1583480834961, "rewards/rejected": -62.052345275878906, "step": 2860 }, { "epoch": 0.563297350343474, "grad_norm": 139.56476754754868, "learning_rate": 2.3844155340242893e-07, "logits/chosen": -2.2458949089050293, "logits/rejected": -1.4891345500946045, "logps/chosen": -207.64797973632812, "logps/rejected": -278.21185302734375, "loss": 0.7197, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 8.496252059936523, "rewards/margins": 67.86898040771484, "rewards/rejected": -59.37272262573242, "step": 2870 }, { "epoch": 0.5652600588812562, "grad_norm": 106.89406564492629, "learning_rate": 2.36730696795826e-07, "logits/chosen": -2.2247424125671387, "logits/rejected": -1.844606637954712, "logps/chosen": -255.03076171875, "logps/rejected": -380.1404724121094, "loss": 0.7294, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.823201179504395, "rewards/margins": 29.502965927124023, "rewards/rejected": -44.32616424560547, "step": 2880 }, { "epoch": 0.5672227674190383, "grad_norm": 81.08938483786679, "learning_rate": 2.3502046315972655e-07, "logits/chosen": -1.9563055038452148, "logits/rejected": -1.0683209896087646, "logps/chosen": -318.98309326171875, "logps/rejected": -375.96575927734375, "loss": 0.6133, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.23508186638355255, "rewards/margins": 81.9447250366211, "rewards/rejected": -82.1798095703125, "step": 2890 }, { "epoch": 0.5691854759568205, "grad_norm": 163.15605757692506, "learning_rate": 2.3331093278659906e-07, "logits/chosen": -2.006662368774414, "logits/rejected": -1.3949081897735596, "logps/chosen": -317.40667724609375, "logps/rejected": -352.4535217285156, "loss": 1.2011, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -8.814798355102539, "rewards/margins": 59.48215866088867, "rewards/rejected": -68.29695892333984, "step": 2900 }, { "epoch": 0.5711481844946026, "grad_norm": 115.70583417864563, "learning_rate": 2.31602185935895e-07, "logits/chosen": -2.1096019744873047, "logits/rejected": -1.2632701396942139, "logps/chosen": -287.8899230957031, "logps/rejected": -291.636474609375, "loss": 0.7118, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -4.083498954772949, "rewards/margins": 64.30267333984375, "rewards/rejected": -68.38616943359375, "step": 2910 }, { "epoch": 0.5731108930323847, "grad_norm": 117.20749269466828, "learning_rate": 2.298943028302811e-07, "logits/chosen": -2.391432762145996, "logits/rejected": -1.5941880941390991, "logps/chosen": -279.3251037597656, "logps/rejected": -391.8353576660156, "loss": 0.6535, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 10.151693344116211, "rewards/margins": 90.73081970214844, "rewards/rejected": -80.57913208007812, "step": 2920 }, { "epoch": 0.5750736015701668, "grad_norm": 86.12049886744171, "learning_rate": 2.2818736365187242e-07, "logits/chosen": -2.1691224575042725, "logits/rejected": -1.5063917636871338, "logps/chosen": -219.89309692382812, "logps/rejected": -252.3799591064453, "loss": 0.6329, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 7.1943769454956055, "rewards/margins": 67.58497619628906, "rewards/rejected": -60.390602111816406, "step": 2930 }, { "epoch": 0.577036310107949, "grad_norm": 223.79729637803106, "learning_rate": 2.2648144853846847e-07, "logits/chosen": -1.6522821187973022, "logits/rejected": -0.9141775965690613, "logps/chosen": -265.09088134765625, "logps/rejected": -352.2686462402344, "loss": 0.6046, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -5.908082962036133, "rewards/margins": 80.85003662109375, "rewards/rejected": -86.75811004638672, "step": 2940 }, { "epoch": 0.5789990186457311, "grad_norm": 118.75192042619383, "learning_rate": 2.247766375797906e-07, "logits/chosen": -1.7435131072998047, "logits/rejected": -1.5601297616958618, "logps/chosen": -185.4170684814453, "logps/rejected": -228.70193481445312, "loss": 0.7691, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 5.978199481964111, "rewards/margins": 29.892385482788086, "rewards/rejected": -23.9141845703125, "step": 2950 }, { "epoch": 0.5809617271835132, "grad_norm": 160.64090320849542, "learning_rate": 2.2307301081372222e-07, "logits/chosen": -1.534781813621521, "logits/rejected": -1.6206640005111694, "logps/chosen": -256.223876953125, "logps/rejected": -298.49688720703125, "loss": 0.7473, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 1.673948049545288, "rewards/margins": 24.34869384765625, "rewards/rejected": -22.674747467041016, "step": 2960 }, { "epoch": 0.5829244357212954, "grad_norm": 88.3933862090185, "learning_rate": 2.2137064822255086e-07, "logits/chosen": -1.7883812189102173, "logits/rejected": -1.5013943910598755, "logps/chosen": -235.02685546875, "logps/rejected": -263.28741455078125, "loss": 0.7735, "rewards/accuracies": 0.533333420753479, "rewards/chosen": -14.268635749816895, "rewards/margins": 33.0501708984375, "rewards/rejected": -47.31879806518555, "step": 2970 }, { "epoch": 0.5848871442590775, "grad_norm": 116.70842887482725, "learning_rate": 2.1966962972921322e-07, "logits/chosen": -2.0745081901550293, "logits/rejected": -1.5815775394439697, "logps/chosen": -238.42245483398438, "logps/rejected": -313.05804443359375, "loss": 0.8214, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -1.5816704034805298, "rewards/margins": 41.731468200683594, "rewards/rejected": -43.313133239746094, "step": 2980 }, { "epoch": 0.5868498527968596, "grad_norm": 152.0310716683447, "learning_rate": 2.1797003519354285e-07, "logits/chosen": -2.078899621963501, "logits/rejected": -1.6466604471206665, "logps/chosen": -254.897705078125, "logps/rejected": -308.11444091796875, "loss": 0.7389, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 6.972849369049072, "rewards/margins": 51.743011474609375, "rewards/rejected": -44.77016067504883, "step": 2990 }, { "epoch": 0.5888125613346418, "grad_norm": 78.86911708574102, "learning_rate": 2.1627194440852142e-07, "logits/chosen": -1.433844804763794, "logits/rejected": -1.1446534395217896, "logps/chosen": -305.498779296875, "logps/rejected": -318.3632507324219, "loss": 0.7851, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -11.723398208618164, "rewards/margins": 48.450172424316406, "rewards/rejected": -60.17356491088867, "step": 3000 }, { "epoch": 0.5907752698724239, "grad_norm": 108.7837836847545, "learning_rate": 2.1457543709653176e-07, "logits/chosen": -1.8923429250717163, "logits/rejected": -1.2739951610565186, "logps/chosen": -288.7276306152344, "logps/rejected": -341.6163024902344, "loss": 0.7189, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -14.93413257598877, "rewards/margins": 74.93661499023438, "rewards/rejected": -89.87073516845703, "step": 3010 }, { "epoch": 0.592737978410206, "grad_norm": 84.0113178897826, "learning_rate": 2.128805929056154e-07, "logits/chosen": -1.9766048192977905, "logits/rejected": -1.394619107246399, "logps/chosen": -186.0647735595703, "logps/rejected": -278.7615051269531, "loss": 0.7496, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -5.230257511138916, "rewards/margins": 78.95034790039062, "rewards/rejected": -84.18060302734375, "step": 3020 }, { "epoch": 0.5947006869479883, "grad_norm": 211.17704262942456, "learning_rate": 2.1118749140573358e-07, "logits/chosen": -2.093601942062378, "logits/rejected": -1.4542027711868286, "logps/chosen": -260.0766296386719, "logps/rejected": -298.0777587890625, "loss": 0.7183, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": -5.917730331420898, "rewards/margins": 40.93926239013672, "rewards/rejected": -46.85699462890625, "step": 3030 }, { "epoch": 0.5966633954857704, "grad_norm": 88.15194406511203, "learning_rate": 2.0949621208503092e-07, "logits/chosen": -2.0554678440093994, "logits/rejected": -1.6898086071014404, "logps/chosen": -291.20843505859375, "logps/rejected": -249.0556640625, "loss": 0.8225, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 9.513040542602539, "rewards/margins": 37.10463333129883, "rewards/rejected": -27.591594696044922, "step": 3040 }, { "epoch": 0.5986261040235525, "grad_norm": 299.15890226415377, "learning_rate": 2.0780683434610413e-07, "logits/chosen": -2.0362281799316406, "logits/rejected": -1.5792611837387085, "logps/chosen": -251.09896850585938, "logps/rejected": -310.0597839355469, "loss": 0.7419, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -10.650520324707031, "rewards/margins": 44.67280960083008, "rewards/rejected": -55.323326110839844, "step": 3050 }, { "epoch": 0.6005888125613347, "grad_norm": 127.41508729777355, "learning_rate": 2.0611943750227375e-07, "logits/chosen": -1.999168038368225, "logits/rejected": -1.304848313331604, "logps/chosen": -272.5399169921875, "logps/rejected": -311.75701904296875, "loss": 0.7465, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -9.94316577911377, "rewards/margins": 58.5887336730957, "rewards/rejected": -68.53189849853516, "step": 3060 }, { "epoch": 0.6025515210991168, "grad_norm": 99.13510587153851, "learning_rate": 2.044341007738612e-07, "logits/chosen": -2.30557918548584, "logits/rejected": -1.5714919567108154, "logps/chosen": -351.8255920410156, "logps/rejected": -333.9390563964844, "loss": 0.8532, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.9636003375053406, "rewards/margins": 35.39982223510742, "rewards/rejected": -36.36342239379883, "step": 3070 }, { "epoch": 0.6045142296368989, "grad_norm": 84.93092944534997, "learning_rate": 2.027509032844687e-07, "logits/chosen": -2.250655174255371, "logits/rejected": -2.2882039546966553, "logps/chosen": -338.1631774902344, "logps/rejected": -394.54608154296875, "loss": 0.7667, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -18.098678588867188, "rewards/margins": 24.721378326416016, "rewards/rejected": -42.8200569152832, "step": 3080 }, { "epoch": 0.6064769381746811, "grad_norm": 83.77935755289745, "learning_rate": 2.010699240572651e-07, "logits/chosen": -2.414731502532959, "logits/rejected": -2.2311673164367676, "logps/chosen": -344.16973876953125, "logps/rejected": -356.10650634765625, "loss": 0.7526, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 14.781620979309082, "rewards/margins": 45.816993713378906, "rewards/rejected": -31.035375595092773, "step": 3090 }, { "epoch": 0.6084396467124632, "grad_norm": 133.4488698039462, "learning_rate": 1.993912420112756e-07, "logits/chosen": -2.322948694229126, "logits/rejected": -1.7190580368041992, "logps/chosen": -306.60784912109375, "logps/rejected": -415.96502685546875, "loss": 1.2366, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 16.838117599487305, "rewards/margins": 48.656890869140625, "rewards/rejected": -31.818775177001953, "step": 3100 }, { "epoch": 0.6104023552502453, "grad_norm": 263.54661849515423, "learning_rate": 1.9771493595767707e-07, "logits/chosen": -2.306044816970825, "logits/rejected": -2.063917398452759, "logps/chosen": -282.313720703125, "logps/rejected": -372.73968505859375, "loss": 0.773, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": 3.3695220947265625, "rewards/margins": 49.335235595703125, "rewards/rejected": -45.96570587158203, "step": 3110 }, { "epoch": 0.6123650637880275, "grad_norm": 92.12264659197632, "learning_rate": 1.9604108459609752e-07, "logits/chosen": -2.1388299465179443, "logits/rejected": -1.7471414804458618, "logps/chosen": -322.26092529296875, "logps/rejected": -366.0241394042969, "loss": 0.7449, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -2.502537965774536, "rewards/margins": 45.889347076416016, "rewards/rejected": -48.391883850097656, "step": 3120 }, { "epoch": 0.6143277723258096, "grad_norm": 80.59078794851929, "learning_rate": 1.9436976651092142e-07, "logits/chosen": -2.146054744720459, "logits/rejected": -1.5463652610778809, "logps/chosen": -315.1632995605469, "logps/rejected": -363.4345703125, "loss": 0.7006, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -6.4701738357543945, "rewards/margins": 57.543663024902344, "rewards/rejected": -64.01383972167969, "step": 3130 }, { "epoch": 0.6162904808635917, "grad_norm": 192.68796754234208, "learning_rate": 1.9270106016760035e-07, "logits/chosen": -2.29899001121521, "logits/rejected": -1.482499122619629, "logps/chosen": -269.68560791015625, "logps/rejected": -346.1686706542969, "loss": 0.7765, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -5.903042793273926, "rewards/margins": 67.49510192871094, "rewards/rejected": -73.39815521240234, "step": 3140 }, { "epoch": 0.6182531894013739, "grad_norm": 96.38734154352139, "learning_rate": 1.9103504390896944e-07, "logits/chosen": -1.9073091745376587, "logits/rejected": -1.1880276203155518, "logps/chosen": -233.3427276611328, "logps/rejected": -368.07293701171875, "loss": 0.7109, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -9.45210075378418, "rewards/margins": 72.71215057373047, "rewards/rejected": -82.16426086425781, "step": 3150 }, { "epoch": 0.620215897939156, "grad_norm": 134.84106765197814, "learning_rate": 1.8937179595156876e-07, "logits/chosen": -2.4171786308288574, "logits/rejected": -1.2940549850463867, "logps/chosen": -276.4836120605469, "logps/rejected": -272.7891540527344, "loss": 0.7108, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 2.4751052856445312, "rewards/margins": 66.28428649902344, "rewards/rejected": -63.80919647216797, "step": 3160 }, { "epoch": 0.6221786064769381, "grad_norm": 121.61969829850759, "learning_rate": 1.8771139438197168e-07, "logits/chosen": -2.3684420585632324, "logits/rejected": -1.672045111656189, "logps/chosen": -282.76666259765625, "logps/rejected": -351.12384033203125, "loss": 0.7738, "rewards/accuracies": 0.76666659116745, "rewards/chosen": 9.382087707519531, "rewards/margins": 61.03277587890625, "rewards/rejected": -51.65069580078125, "step": 3170 }, { "epoch": 0.6241413150147204, "grad_norm": 100.05837592598611, "learning_rate": 1.8605391715311846e-07, "logits/chosen": -1.9672191143035889, "logits/rejected": -1.187548279762268, "logps/chosen": -284.04742431640625, "logps/rejected": -250.73623657226562, "loss": 0.7156, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0008848905563354, "rewards/margins": 44.5206298828125, "rewards/rejected": -43.519737243652344, "step": 3180 }, { "epoch": 0.6261040235525025, "grad_norm": 73.4831768339145, "learning_rate": 1.8439944208065704e-07, "logits/chosen": -2.0867557525634766, "logits/rejected": -1.4983081817626953, "logps/chosen": -340.9936218261719, "logps/rejected": -376.168212890625, "loss": 0.7919, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 8.620040893554688, "rewards/margins": 58.236595153808594, "rewards/rejected": -49.61655807495117, "step": 3190 }, { "epoch": 0.6280667320902846, "grad_norm": 299.8154575002267, "learning_rate": 1.8274804683928913e-07, "logits/chosen": -1.997650146484375, "logits/rejected": -1.242893934249878, "logps/chosen": -336.35943603515625, "logps/rejected": -350.4289855957031, "loss": 0.7959, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -3.4962031841278076, "rewards/margins": 63.2283821105957, "rewards/rejected": -66.7245864868164, "step": 3200 }, { "epoch": 0.6300294406280668, "grad_norm": 64.31674145463639, "learning_rate": 1.810998089591238e-07, "logits/chosen": -2.248622179031372, "logits/rejected": -1.8068727254867554, "logps/chosen": -249.48519897460938, "logps/rejected": -312.36114501953125, "loss": 0.7536, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 3.5885841846466064, "rewards/margins": 57.19319534301758, "rewards/rejected": -53.6046142578125, "step": 3210 }, { "epoch": 0.6319921491658489, "grad_norm": 110.92174498667798, "learning_rate": 1.7945480582203745e-07, "logits/chosen": -2.072582960128784, "logits/rejected": -1.7200475931167603, "logps/chosen": -262.9791564941406, "logps/rejected": -316.9862060546875, "loss": 0.7465, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -14.136678695678711, "rewards/margins": 35.017723083496094, "rewards/rejected": -49.15439987182617, "step": 3220 }, { "epoch": 0.633954857703631, "grad_norm": 74.911476704379, "learning_rate": 1.7781311465804128e-07, "logits/chosen": -2.216766834259033, "logits/rejected": -1.4819574356079102, "logps/chosen": -274.76068115234375, "logps/rejected": -271.5936584472656, "loss": 0.7988, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": 4.339162349700928, "rewards/margins": 42.367881774902344, "rewards/rejected": -38.028717041015625, "step": 3230 }, { "epoch": 0.6359175662414132, "grad_norm": 93.94788784342087, "learning_rate": 1.7617481254165487e-07, "logits/chosen": -2.1821932792663574, "logits/rejected": -1.6851694583892822, "logps/chosen": -266.036376953125, "logps/rejected": -279.02947998046875, "loss": 0.7283, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.567543983459473, "rewards/margins": 61.737396240234375, "rewards/rejected": -56.16984939575195, "step": 3240 }, { "epoch": 0.6378802747791953, "grad_norm": 87.31741958467255, "learning_rate": 1.745399763882881e-07, "logits/chosen": -2.1134731769561768, "logits/rejected": -1.253485918045044, "logps/chosen": -284.7551574707031, "logps/rejected": -366.5562438964844, "loss": 0.6709, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 4.997917175292969, "rewards/margins": 94.76776885986328, "rewards/rejected": -89.76985168457031, "step": 3250 }, { "epoch": 0.6398429833169774, "grad_norm": 60.84219590839291, "learning_rate": 1.7290868295062983e-07, "logits/chosen": -2.067007064819336, "logits/rejected": -1.5906989574432373, "logps/chosen": -269.60015869140625, "logps/rejected": -329.2178039550781, "loss": 0.7044, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 17.791763305664062, "rewards/margins": 62.0831298828125, "rewards/rejected": -44.29136276245117, "step": 3260 }, { "epoch": 0.6418056918547596, "grad_norm": 114.12446380006371, "learning_rate": 1.7128100881504492e-07, "logits/chosen": -2.0023036003112793, "logits/rejected": -1.1316359043121338, "logps/chosen": -283.4881286621094, "logps/rejected": -272.5941162109375, "loss": 0.761, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0853993892669678, "rewards/margins": 58.28447341918945, "rewards/rejected": -55.199073791503906, "step": 3270 }, { "epoch": 0.6437684003925417, "grad_norm": 72.10740342590225, "learning_rate": 1.6965703039797808e-07, "logits/chosen": -2.162951707839966, "logits/rejected": -0.9369997978210449, "logps/chosen": -316.176513671875, "logps/rejected": -316.67218017578125, "loss": 0.6602, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -0.7439907193183899, "rewards/margins": 88.97277069091797, "rewards/rejected": -89.71675872802734, "step": 3280 }, { "epoch": 0.6457311089303238, "grad_norm": 96.76507271120425, "learning_rate": 1.6803682394236656e-07, "logits/chosen": -2.1952121257781982, "logits/rejected": -1.4405162334442139, "logps/chosen": -328.23956298828125, "logps/rejected": -340.05218505859375, "loss": 0.7299, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 4.097502708435059, "rewards/margins": 75.30216217041016, "rewards/rejected": -71.20466613769531, "step": 3290 }, { "epoch": 0.647693817468106, "grad_norm": 162.37668079466067, "learning_rate": 1.664204655140607e-07, "logits/chosen": -1.946126937866211, "logits/rejected": -1.663977026939392, "logps/chosen": -236.5516357421875, "logps/rejected": -345.7264099121094, "loss": 0.7726, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 0.7376834154129028, "rewards/margins": 67.84037017822266, "rewards/rejected": -67.10269165039062, "step": 3300 }, { "epoch": 0.6496565260058881, "grad_norm": 90.12397498786609, "learning_rate": 1.6480803099825277e-07, "logits/chosen": -1.9293769598007202, "logits/rejected": -1.2741062641143799, "logps/chosen": -264.91790771484375, "logps/rejected": -296.45794677734375, "loss": 0.7108, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -3.28875470161438, "rewards/margins": 71.97184753417969, "rewards/rejected": -75.26060485839844, "step": 3310 }, { "epoch": 0.6516192345436702, "grad_norm": 165.55543993004838, "learning_rate": 1.6319959609591412e-07, "logits/chosen": -1.412754774093628, "logits/rejected": -1.176635980606079, "logps/chosen": -270.6043395996094, "logps/rejected": -241.142333984375, "loss": 0.7738, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -28.029138565063477, "rewards/margins": 17.126901626586914, "rewards/rejected": -45.15604019165039, "step": 3320 }, { "epoch": 0.6535819430814525, "grad_norm": 67.42258639911547, "learning_rate": 1.6159523632024126e-07, "logits/chosen": -2.031461715698242, "logits/rejected": -1.4194605350494385, "logps/chosen": -303.0947570800781, "logps/rejected": -380.33099365234375, "loss": 0.7525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -13.691067695617676, "rewards/margins": 51.06209945678711, "rewards/rejected": -64.75315856933594, "step": 3330 }, { "epoch": 0.6555446516192346, "grad_norm": 130.1594949987406, "learning_rate": 1.599950269931107e-07, "logits/chosen": -1.7198108434677124, "logits/rejected": -1.4300183057785034, "logps/chosen": -308.6605529785156, "logps/rejected": -285.02655029296875, "loss": 0.7897, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -12.423626899719238, "rewards/margins": 20.34939193725586, "rewards/rejected": -32.77301788330078, "step": 3340 }, { "epoch": 0.6575073601570167, "grad_norm": 94.0488349002007, "learning_rate": 1.5839904324154273e-07, "logits/chosen": -2.0419249534606934, "logits/rejected": -1.3615930080413818, "logps/chosen": -284.0905456542969, "logps/rejected": -341.1413879394531, "loss": 0.73, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9335086941719055, "rewards/margins": 46.902244567871094, "rewards/rejected": -45.96873474121094, "step": 3350 }, { "epoch": 0.6594700686947988, "grad_norm": 111.57098953892903, "learning_rate": 1.568073599941742e-07, "logits/chosen": -2.108177661895752, "logits/rejected": -1.7325401306152344, "logps/chosen": -300.68646240234375, "logps/rejected": -333.4554748535156, "loss": 0.6875, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -2.0172219276428223, "rewards/margins": 57.87189483642578, "rewards/rejected": -59.88911056518555, "step": 3360 }, { "epoch": 0.661432777232581, "grad_norm": 68.16727284422387, "learning_rate": 1.552200519777408e-07, "logits/chosen": -2.2538270950317383, "logits/rejected": -1.382280945777893, "logps/chosen": -319.42095947265625, "logps/rejected": -317.35577392578125, "loss": 0.6716, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -0.7386177182197571, "rewards/margins": 68.20567321777344, "rewards/rejected": -68.94429016113281, "step": 3370 }, { "epoch": 0.6633954857703631, "grad_norm": 68.6003702867741, "learning_rate": 1.5363719371356882e-07, "logits/chosen": -2.4917187690734863, "logits/rejected": -2.0385372638702393, "logps/chosen": -339.6763000488281, "logps/rejected": -271.23944091796875, "loss": 0.7439, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 1.0836315155029297, "rewards/margins": 44.689064025878906, "rewards/rejected": -43.605430603027344, "step": 3380 }, { "epoch": 0.6653581943081452, "grad_norm": 60.929551677940225, "learning_rate": 1.5205885951407665e-07, "logits/chosen": -2.0505824089050293, "logits/rejected": -1.8102777004241943, "logps/chosen": -263.76776123046875, "logps/rejected": -367.0315856933594, "loss": 0.6293, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -9.958749771118164, "rewards/margins": 54.30242919921875, "rewards/rejected": -64.26116943359375, "step": 3390 }, { "epoch": 0.6673209028459274, "grad_norm": 90.11509384549309, "learning_rate": 1.5048512347928564e-07, "logits/chosen": -1.773870825767517, "logits/rejected": -0.5375600457191467, "logps/chosen": -290.7041320800781, "logps/rejected": -292.1522521972656, "loss": 0.6565, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -2.4120681285858154, "rewards/margins": 87.38268280029297, "rewards/rejected": -89.79476165771484, "step": 3400 }, { "epoch": 0.6692836113837095, "grad_norm": 149.31298241670422, "learning_rate": 1.4891605949334133e-07, "logits/chosen": -2.053164482116699, "logits/rejected": -0.711159884929657, "logps/chosen": -481.1689453125, "logps/rejected": -481.52410888671875, "loss": 0.7443, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -8.358963012695312, "rewards/margins": 93.5754165649414, "rewards/rejected": -101.93437194824219, "step": 3410 }, { "epoch": 0.6712463199214916, "grad_norm": 152.79394128500897, "learning_rate": 1.4735174122104476e-07, "logits/chosen": -1.5674101114273071, "logits/rejected": -0.8240894079208374, "logps/chosen": -246.6388397216797, "logps/rejected": -274.95709228515625, "loss": 0.79, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -7.239173889160156, "rewards/margins": 68.80106353759766, "rewards/rejected": -76.04023742675781, "step": 3420 }, { "epoch": 0.6732090284592738, "grad_norm": 85.19393801663672, "learning_rate": 1.457922421043943e-07, "logits/chosen": -2.1374614238739014, "logits/rejected": -1.2740757465362549, "logps/chosen": -332.84588623046875, "logps/rejected": -255.8601531982422, "loss": 0.7883, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -12.803683280944824, "rewards/margins": 37.676639556884766, "rewards/rejected": -50.480323791503906, "step": 3430 }, { "epoch": 0.6751717369970559, "grad_norm": 114.20219488545818, "learning_rate": 1.4423763535913704e-07, "logits/chosen": -2.3347208499908447, "logits/rejected": -1.697603464126587, "logps/chosen": -258.1283874511719, "logps/rejected": -326.790283203125, "loss": 0.7095, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 9.315744400024414, "rewards/margins": 68.07012939453125, "rewards/rejected": -58.7543830871582, "step": 3440 }, { "epoch": 0.677134445534838, "grad_norm": 74.27070727942997, "learning_rate": 1.426879939713322e-07, "logits/chosen": -2.1975417137145996, "logits/rejected": -1.8118565082550049, "logps/chosen": -304.7396545410156, "logps/rejected": -300.7691955566406, "loss": 0.7392, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -0.8934723138809204, "rewards/margins": 64.23905181884766, "rewards/rejected": -65.13252258300781, "step": 3450 }, { "epoch": 0.6790971540726202, "grad_norm": 149.9665157810247, "learning_rate": 1.4114339069392374e-07, "logits/chosen": -2.324371814727783, "logits/rejected": -1.4152635335922241, "logps/chosen": -287.4292907714844, "logps/rejected": -275.90118408203125, "loss": 0.6728, "rewards/accuracies": 0.76666659116745, "rewards/chosen": 9.25989818572998, "rewards/margins": 73.53782653808594, "rewards/rejected": -64.2779312133789, "step": 3460 }, { "epoch": 0.6810598626104023, "grad_norm": 95.94417219099574, "learning_rate": 1.3960389804332556e-07, "logits/chosen": -2.220851421356201, "logits/rejected": -1.4349888563156128, "logps/chosen": -249.00820922851562, "logps/rejected": -356.7861022949219, "loss": 0.7581, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 8.134378433227539, "rewards/margins": 73.06588745117188, "rewards/rejected": -64.9315185546875, "step": 3470 }, { "epoch": 0.6830225711481845, "grad_norm": 152.33055578021725, "learning_rate": 1.380695882960165e-07, "logits/chosen": -2.0812017917633057, "logits/rejected": -1.6904922723770142, "logps/chosen": -273.93426513671875, "logps/rejected": -295.7121887207031, "loss": 0.7459, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -0.9489307403564453, "rewards/margins": 67.77001190185547, "rewards/rejected": -68.71894836425781, "step": 3480 }, { "epoch": 0.6849852796859667, "grad_norm": 136.91486238649046, "learning_rate": 1.3654053348514702e-07, "logits/chosen": -1.7881940603256226, "logits/rejected": -1.252142310142517, "logps/chosen": -154.3620147705078, "logps/rejected": -258.3116455078125, "loss": 0.6839, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 10.162893295288086, "rewards/margins": 66.80345153808594, "rewards/rejected": -56.64055633544922, "step": 3490 }, { "epoch": 0.6869479882237488, "grad_norm": 104.3656843062372, "learning_rate": 1.350168053971577e-07, "logits/chosen": -2.0309269428253174, "logits/rejected": -1.040135383605957, "logps/chosen": -358.5278625488281, "logps/rejected": -309.7778015136719, "loss": 0.7573, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 3.980185031890869, "rewards/margins": 74.04118347167969, "rewards/rejected": -70.06099700927734, "step": 3500 }, { "epoch": 0.6889106967615309, "grad_norm": 119.76957268992976, "learning_rate": 1.3349847556840876e-07, "logits/chosen": -1.9207813739776611, "logits/rejected": -1.4940179586410522, "logps/chosen": -253.9391632080078, "logps/rejected": -334.3329772949219, "loss": 0.7085, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -8.582676887512207, "rewards/margins": 59.07757568359375, "rewards/rejected": -67.6602554321289, "step": 3510 }, { "epoch": 0.6908734052993131, "grad_norm": 71.09083330073716, "learning_rate": 1.3198561528182182e-07, "logits/chosen": -1.6225411891937256, "logits/rejected": -1.2249433994293213, "logps/chosen": -223.7402801513672, "logps/rejected": -290.65521240234375, "loss": 0.8386, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -24.693653106689453, "rewards/margins": 42.28459930419922, "rewards/rejected": -66.97825622558594, "step": 3520 }, { "epoch": 0.6928361138370952, "grad_norm": 81.90000851307036, "learning_rate": 1.3047829556353263e-07, "logits/chosen": -2.1534814834594727, "logits/rejected": -1.5600712299346924, "logps/chosen": -250.3526153564453, "logps/rejected": -296.6809997558594, "loss": 0.7297, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 2.474473476409912, "rewards/margins": 59.12791061401367, "rewards/rejected": -56.6534423828125, "step": 3530 }, { "epoch": 0.6947988223748773, "grad_norm": 120.02615897104567, "learning_rate": 1.2897658717955742e-07, "logits/chosen": -1.8396505117416382, "logits/rejected": -1.0469920635223389, "logps/chosen": -268.4101257324219, "logps/rejected": -292.49139404296875, "loss": 0.6806, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -6.083364963531494, "rewards/margins": 72.87593841552734, "rewards/rejected": -78.95929718017578, "step": 3540 }, { "epoch": 0.6967615309126595, "grad_norm": 311.250447790443, "learning_rate": 1.2748056063246994e-07, "logits/chosen": -1.9446853399276733, "logits/rejected": -1.6984657049179077, "logps/chosen": -311.6482238769531, "logps/rejected": -332.0123596191406, "loss": 0.8195, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -9.940755844116211, "rewards/margins": 32.68222427368164, "rewards/rejected": -42.62297821044922, "step": 3550 }, { "epoch": 0.6987242394504416, "grad_norm": 95.99486220388819, "learning_rate": 1.2599028615809183e-07, "logits/chosen": -2.1044070720672607, "logits/rejected": -1.250933051109314, "logps/chosen": -287.5272521972656, "logps/rejected": -294.6505432128906, "loss": 0.7716, "rewards/accuracies": 0.7333332300186157, "rewards/chosen": 9.252132415771484, "rewards/margins": 65.3217544555664, "rewards/rejected": -56.06962203979492, "step": 3560 }, { "epoch": 0.7006869479882237, "grad_norm": 142.16609436063985, "learning_rate": 1.2450583372219458e-07, "logits/chosen": -1.8274517059326172, "logits/rejected": -1.6515783071517944, "logps/chosen": -295.6714782714844, "logps/rejected": -337.58880615234375, "loss": 0.7054, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.8436713218688965, "rewards/margins": 49.22053909301758, "rewards/rejected": -42.376869201660156, "step": 3570 }, { "epoch": 0.7026496565260059, "grad_norm": 132.67830545022585, "learning_rate": 1.230272730172157e-07, "logits/chosen": -1.9185707569122314, "logits/rejected": -1.4781296253204346, "logps/chosen": -299.1072082519531, "logps/rejected": -370.0293273925781, "loss": 0.7909, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -2.690268039703369, "rewards/margins": 59.68656539916992, "rewards/rejected": -62.3768310546875, "step": 3580 }, { "epoch": 0.704612365063788, "grad_norm": 172.7905265801041, "learning_rate": 1.2155467345898602e-07, "logits/chosen": -1.9594173431396484, "logits/rejected": -1.7599966526031494, "logps/chosen": -282.20452880859375, "logps/rejected": -340.27801513671875, "loss": 0.8609, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -14.746014595031738, "rewards/margins": 12.017654418945312, "rewards/rejected": -26.7636661529541, "step": 3590 }, { "epoch": 0.7065750736015701, "grad_norm": 90.02563892278901, "learning_rate": 1.2008810418347093e-07, "logits/chosen": -2.0893871784210205, "logits/rejected": -1.8588151931762695, "logps/chosen": -192.67343139648438, "logps/rejected": -254.4624481201172, "loss": 0.7103, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 8.384778022766113, "rewards/margins": 51.04999542236328, "rewards/rejected": -42.66521453857422, "step": 3600 }, { "epoch": 0.7085377821393523, "grad_norm": 118.65548403178747, "learning_rate": 1.1862763404352483e-07, "logits/chosen": -2.3507723808288574, "logits/rejected": -1.472481608390808, "logps/chosen": -318.70611572265625, "logps/rejected": -338.7253112792969, "loss": 0.6824, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -4.6282124519348145, "rewards/margins": 54.795005798339844, "rewards/rejected": -59.4232177734375, "step": 3610 }, { "epoch": 0.7105004906771345, "grad_norm": 82.38652713931373, "learning_rate": 1.1717333160565807e-07, "logits/chosen": -2.2014546394348145, "logits/rejected": -1.7044570446014404, "logps/chosen": -357.2608947753906, "logps/rejected": -332.08135986328125, "loss": 0.7118, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.44674986600875854, "rewards/margins": 50.18072509765625, "rewards/rejected": -49.733978271484375, "step": 3620 }, { "epoch": 0.7124631992149166, "grad_norm": 110.58274592140684, "learning_rate": 1.1572526514681874e-07, "logits/chosen": -1.8637192249298096, "logits/rejected": -1.4561747312545776, "logps/chosen": -311.5350036621094, "logps/rejected": -383.87493896484375, "loss": 0.8103, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -13.515321731567383, "rewards/margins": 42.28776931762695, "rewards/rejected": -55.80309295654297, "step": 3630 }, { "epoch": 0.7144259077526988, "grad_norm": 133.73139883695544, "learning_rate": 1.1428350265118613e-07, "logits/chosen": -2.2391510009765625, "logits/rejected": -1.4630959033966064, "logps/chosen": -321.3340759277344, "logps/rejected": -335.08209228515625, "loss": 0.7541, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -4.154849052429199, "rewards/margins": 53.90868377685547, "rewards/rejected": -58.06352996826172, "step": 3640 }, { "epoch": 0.7163886162904809, "grad_norm": 116.70324728707418, "learning_rate": 1.128481118069799e-07, "logits/chosen": -2.27311372756958, "logits/rejected": -0.9830902218818665, "logps/chosen": -269.22857666015625, "logps/rejected": -365.013427734375, "loss": 0.695, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 1.8014949560165405, "rewards/margins": 91.56087493896484, "rewards/rejected": -89.75938415527344, "step": 3650 }, { "epoch": 0.718351324828263, "grad_norm": 69.84365349693351, "learning_rate": 1.114191600032815e-07, "logits/chosen": -2.2708144187927246, "logits/rejected": -1.3350690603256226, "logps/chosen": -323.60638427734375, "logps/rejected": -336.3446044921875, "loss": 0.745, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -11.991853713989258, "rewards/margins": 51.13048553466797, "rewards/rejected": -63.122337341308594, "step": 3660 }, { "epoch": 0.7203140333660452, "grad_norm": 78.26824832399404, "learning_rate": 1.0999671432687099e-07, "logits/chosen": -2.2681617736816406, "logits/rejected": -1.0384986400604248, "logps/chosen": -296.62017822265625, "logps/rejected": -281.9424743652344, "loss": 0.7393, "rewards/accuracies": 0.73333340883255, "rewards/chosen": 2.313171863555908, "rewards/margins": 47.61670684814453, "rewards/rejected": -45.30353546142578, "step": 3670 }, { "epoch": 0.7222767419038273, "grad_norm": 100.38307095703568, "learning_rate": 1.085808415590772e-07, "logits/chosen": -2.2477574348449707, "logits/rejected": -1.7549083232879639, "logps/chosen": -281.59161376953125, "logps/rejected": -305.6994934082031, "loss": 0.7651, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 4.167940616607666, "rewards/margins": 62.10760498046875, "rewards/rejected": -57.939674377441406, "step": 3680 }, { "epoch": 0.7242394504416094, "grad_norm": 115.9868066033916, "learning_rate": 1.0717160817264217e-07, "logits/chosen": -2.1291849613189697, "logits/rejected": -1.1697852611541748, "logps/chosen": -287.45367431640625, "logps/rejected": -353.3840026855469, "loss": 0.6226, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -14.975071907043457, "rewards/margins": 69.79930114746094, "rewards/rejected": -84.77436828613281, "step": 3690 }, { "epoch": 0.7262021589793916, "grad_norm": 100.55129569910939, "learning_rate": 1.0576908032860088e-07, "logits/chosen": -1.6502158641815186, "logits/rejected": -1.3260905742645264, "logps/chosen": -259.0820617675781, "logps/rejected": -269.95233154296875, "loss": 0.7173, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -2.5750763416290283, "rewards/margins": 60.36382293701172, "rewards/rejected": -62.93889617919922, "step": 3700 }, { "epoch": 0.7281648675171737, "grad_norm": 123.39088313593878, "learning_rate": 1.0437332387317474e-07, "logits/chosen": -2.3671140670776367, "logits/rejected": -1.624356985092163, "logps/chosen": -241.73068237304688, "logps/rejected": -266.251708984375, "loss": 0.7993, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 3.7030346393585205, "rewards/margins": 58.035552978515625, "rewards/rejected": -54.33251953125, "step": 3710 }, { "epoch": 0.7301275760549558, "grad_norm": 79.10018834077098, "learning_rate": 1.0298440433468048e-07, "logits/chosen": -2.511042833328247, "logits/rejected": -1.6425892114639282, "logps/chosen": -331.259033203125, "logps/rejected": -314.8053283691406, "loss": 0.756, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 6.262584209442139, "rewards/margins": 68.92805480957031, "rewards/rejected": -62.66546630859375, "step": 3720 }, { "epoch": 0.732090284592738, "grad_norm": 66.35586812701564, "learning_rate": 1.0160238692045331e-07, "logits/chosen": -2.239853620529175, "logits/rejected": -1.1930513381958008, "logps/chosen": -255.0226287841797, "logps/rejected": -272.3048400878906, "loss": 0.739, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.6339752674102783, "rewards/margins": 59.85243606567383, "rewards/rejected": -63.486412048339844, "step": 3730 }, { "epoch": 0.7340529931305201, "grad_norm": 107.05866372934094, "learning_rate": 1.0022733651378606e-07, "logits/chosen": -2.341245651245117, "logits/rejected": -1.1085654497146606, "logps/chosen": -371.851806640625, "logps/rejected": -346.4950256347656, "loss": 0.759, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 1.0952996015548706, "rewards/margins": 77.58404541015625, "rewards/rejected": -76.48875427246094, "step": 3740 }, { "epoch": 0.7360157016683022, "grad_norm": 199.67462962256974, "learning_rate": 9.88593176708827e-08, "logits/chosen": -2.1387505531311035, "logits/rejected": -1.8481948375701904, "logps/chosen": -277.35211181640625, "logps/rejected": -330.470458984375, "loss": 0.7458, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -2.0741686820983887, "rewards/margins": 42.64722442626953, "rewards/rejected": -44.72139358520508, "step": 3750 }, { "epoch": 0.7379784102060843, "grad_norm": 117.55231040406439, "learning_rate": 9.749839461782769e-08, "logits/chosen": -2.384817123413086, "logits/rejected": -1.8358865976333618, "logps/chosen": -251.8495330810547, "logps/rejected": -365.1477966308594, "loss": 0.6756, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -2.422518014907837, "rewards/margins": 70.43353271484375, "rewards/rejected": -72.8560562133789, "step": 3760 }, { "epoch": 0.7399411187438666, "grad_norm": 127.94903767499298, "learning_rate": 9.614463124757041e-08, "logits/chosen": -2.1502859592437744, "logits/rejected": -1.956716537475586, "logps/chosen": -231.34188842773438, "logps/rejected": -269.8093566894531, "loss": 0.8405, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.806567192077637, "rewards/margins": 41.57996368408203, "rewards/rejected": -36.77339553833008, "step": 3770 }, { "epoch": 0.7419038272816487, "grad_norm": 110.36715927788727, "learning_rate": 9.479809111692586e-08, "logits/chosen": -2.2960991859436035, "logits/rejected": -1.8740803003311157, "logps/chosen": -244.6845703125, "logps/rejected": -317.9368591308594, "loss": 0.7395, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -9.526605606079102, "rewards/margins": 26.8959903717041, "rewards/rejected": -36.42259216308594, "step": 3780 }, { "epoch": 0.7438665358194309, "grad_norm": 67.55049400023431, "learning_rate": 9.345883744359065e-08, "logits/chosen": -2.305586338043213, "logits/rejected": -1.9559495449066162, "logps/chosen": -272.88287353515625, "logps/rejected": -368.8515625, "loss": 0.7809, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.8336777687072754, "rewards/margins": 29.904748916625977, "rewards/rejected": -32.738426208496094, "step": 3790 }, { "epoch": 0.745829244357213, "grad_norm": 150.19941515760067, "learning_rate": 9.212693310317479e-08, "logits/chosen": -2.3314685821533203, "logits/rejected": -1.7729047536849976, "logps/chosen": -259.419677734375, "logps/rejected": -296.94183349609375, "loss": 0.7802, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -5.721223831176758, "rewards/margins": 49.159786224365234, "rewards/rejected": -54.881004333496094, "step": 3800 }, { "epoch": 0.7477919528949951, "grad_norm": 132.01727631637365, "learning_rate": 9.08024406262503e-08, "logits/chosen": -2.4061217308044434, "logits/rejected": -1.456063151359558, "logps/chosen": -242.51016235351562, "logps/rejected": -309.99969482421875, "loss": 0.7043, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 2.375234365463257, "rewards/margins": 62.546791076660156, "rewards/rejected": -60.17156219482422, "step": 3810 }, { "epoch": 0.7497546614327772, "grad_norm": 109.01412310359188, "learning_rate": 8.94854221954148e-08, "logits/chosen": -2.3203864097595215, "logits/rejected": -1.7620103359222412, "logps/chosen": -223.9595184326172, "logps/rejected": -266.5745849609375, "loss": 0.6763, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 9.473299980163574, "rewards/margins": 71.24612426757812, "rewards/rejected": -61.7728271484375, "step": 3820 }, { "epoch": 0.7517173699705594, "grad_norm": 109.57293406005094, "learning_rate": 8.817593964237316e-08, "logits/chosen": -2.3590502738952637, "logits/rejected": -1.6860431432724, "logps/chosen": -271.23248291015625, "logps/rejected": -302.7469177246094, "loss": 0.6405, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 8.07580280303955, "rewards/margins": 67.08293914794922, "rewards/rejected": -59.00714111328125, "step": 3830 }, { "epoch": 0.7536800785083415, "grad_norm": 133.68371613692528, "learning_rate": 8.68740544450334e-08, "logits/chosen": -2.318648099899292, "logits/rejected": -1.2320307493209839, "logps/chosen": -355.879638671875, "logps/rejected": -331.7142028808594, "loss": 0.7545, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 4.4558305740356445, "rewards/margins": 67.81806945800781, "rewards/rejected": -63.36225128173828, "step": 3840 }, { "epoch": 0.7556427870461236, "grad_norm": 71.47923960831709, "learning_rate": 8.557982772462138e-08, "logits/chosen": -1.8121833801269531, "logits/rejected": -1.0061269998550415, "logps/chosen": -259.7107849121094, "logps/rejected": -333.7269287109375, "loss": 0.6315, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 3.1760687828063965, "rewards/margins": 85.63246154785156, "rewards/rejected": -82.45638275146484, "step": 3850 }, { "epoch": 0.7576054955839058, "grad_norm": 91.34848906522774, "learning_rate": 8.429332024281088e-08, "logits/chosen": -2.1218459606170654, "logits/rejected": -1.1168310642242432, "logps/chosen": -282.3860168457031, "logps/rejected": -294.08612060546875, "loss": 0.7184, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.520068407058716, "rewards/margins": 80.72539520263672, "rewards/rejected": -77.20532989501953, "step": 3860 }, { "epoch": 0.7595682041216879, "grad_norm": 109.27658164301164, "learning_rate": 8.301459239887073e-08, "logits/chosen": -2.0389952659606934, "logits/rejected": -1.498354196548462, "logps/chosen": -344.1943359375, "logps/rejected": -322.50347900390625, "loss": 0.706, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -4.331395626068115, "rewards/margins": 52.150840759277344, "rewards/rejected": -56.48223876953125, "step": 3870 }, { "epoch": 0.76153091265947, "grad_norm": 171.50156269309159, "learning_rate": 8.17437042268298e-08, "logits/chosen": -2.0260977745056152, "logits/rejected": -1.7909427881240845, "logps/chosen": -300.59674072265625, "logps/rejected": -354.51214599609375, "loss": 0.7625, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -11.87992000579834, "rewards/margins": 36.625667572021484, "rewards/rejected": -48.505592346191406, "step": 3880 }, { "epoch": 0.7634936211972522, "grad_norm": 114.67074031454858, "learning_rate": 8.048071539265761e-08, "logits/chosen": -2.144007444381714, "logits/rejected": -0.9628366231918335, "logps/chosen": -328.22552490234375, "logps/rejected": -291.3396911621094, "loss": 0.7745, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -13.772653579711914, "rewards/margins": 46.18323516845703, "rewards/rejected": -59.955894470214844, "step": 3890 }, { "epoch": 0.7654563297350343, "grad_norm": 135.8563318084041, "learning_rate": 7.922568519146425e-08, "logits/chosen": -1.9455242156982422, "logits/rejected": -1.1141116619110107, "logps/chosen": -214.89834594726562, "logps/rejected": -294.12945556640625, "loss": 0.6423, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 11.858541488647461, "rewards/margins": 100.84459686279297, "rewards/rejected": -88.98606872558594, "step": 3900 }, { "epoch": 0.7674190382728164, "grad_norm": 153.53756347292054, "learning_rate": 7.79786725447154e-08, "logits/chosen": -1.8343238830566406, "logits/rejected": -0.830994725227356, "logps/chosen": -261.7711181640625, "logps/rejected": -296.99383544921875, "loss": 0.6555, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -1.874410629272461, "rewards/margins": 79.46246337890625, "rewards/rejected": -81.33686828613281, "step": 3910 }, { "epoch": 0.7693817468105987, "grad_norm": 118.51252222498366, "learning_rate": 7.6739735997467e-08, "logits/chosen": -2.3611745834350586, "logits/rejected": -1.8738129138946533, "logps/chosen": -284.49322509765625, "logps/rejected": -324.45245361328125, "loss": 0.6765, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 13.305956840515137, "rewards/margins": 77.57592010498047, "rewards/rejected": -64.26997375488281, "step": 3920 }, { "epoch": 0.7713444553483808, "grad_norm": 165.53541186196705, "learning_rate": 7.550893371561593e-08, "logits/chosen": -1.8517277240753174, "logits/rejected": -1.401582956314087, "logps/chosen": -251.0609130859375, "logps/rejected": -303.97686767578125, "loss": 0.6916, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 17.28457260131836, "rewards/margins": 76.52352905273438, "rewards/rejected": -59.23895263671875, "step": 3930 }, { "epoch": 0.7733071638861629, "grad_norm": 68.00527459109568, "learning_rate": 7.428632348317004e-08, "logits/chosen": -1.9195239543914795, "logits/rejected": -1.2309167385101318, "logps/chosen": -254.51351928710938, "logps/rejected": -324.16888427734375, "loss": 0.677, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -6.105473041534424, "rewards/margins": 59.58129119873047, "rewards/rejected": -65.686767578125, "step": 3940 }, { "epoch": 0.7752698724239451, "grad_norm": 138.93041618809494, "learning_rate": 7.307196269953444e-08, "logits/chosen": -2.3644955158233643, "logits/rejected": -1.5950266122817993, "logps/chosen": -275.66961669921875, "logps/rejected": -324.525390625, "loss": 0.69, "rewards/accuracies": 0.76666659116745, "rewards/chosen": 7.426495552062988, "rewards/margins": 69.17024993896484, "rewards/rejected": -61.74375534057617, "step": 3950 }, { "epoch": 0.7772325809617272, "grad_norm": 103.24331337613977, "learning_rate": 7.186590837681732e-08, "logits/chosen": -1.9627310037612915, "logits/rejected": -1.0624157190322876, "logps/chosen": -269.38970947265625, "logps/rejected": -269.0195007324219, "loss": 0.724, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -6.550607204437256, "rewards/margins": 55.243072509765625, "rewards/rejected": -61.7936897277832, "step": 3960 }, { "epoch": 0.7791952894995093, "grad_norm": 93.37859675015339, "learning_rate": 7.066821713715293e-08, "logits/chosen": -2.0666093826293945, "logits/rejected": -1.1446895599365234, "logps/chosen": -310.9494934082031, "logps/rejected": -361.5001525878906, "loss": 0.632, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9389166831970215, "rewards/margins": 79.60968780517578, "rewards/rejected": -75.6707763671875, "step": 3970 }, { "epoch": 0.7811579980372915, "grad_norm": 91.62269229890272, "learning_rate": 6.947894521004357e-08, "logits/chosen": -2.073216438293457, "logits/rejected": -2.009721040725708, "logps/chosen": -280.3326416015625, "logps/rejected": -328.5557861328125, "loss": 0.7206, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 1.743980050086975, "rewards/margins": 38.09291458129883, "rewards/rejected": -36.348934173583984, "step": 3980 }, { "epoch": 0.7831207065750736, "grad_norm": 204.32896299146762, "learning_rate": 6.829814842971965e-08, "logits/chosen": -1.9361683130264282, "logits/rejected": -1.5902702808380127, "logps/chosen": -239.34573364257812, "logps/rejected": -295.74468994140625, "loss": 0.7993, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": -13.22026252746582, "rewards/margins": 34.806663513183594, "rewards/rejected": -48.02693176269531, "step": 3990 }, { "epoch": 0.7850834151128557, "grad_norm": 166.2852040099444, "learning_rate": 6.712588223251809e-08, "logits/chosen": -2.180363893508911, "logits/rejected": -1.785426378250122, "logps/chosen": -341.8576965332031, "logps/rejected": -335.52093505859375, "loss": 0.7797, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2243255376815796, "rewards/margins": 54.4521598815918, "rewards/rejected": -55.676490783691406, "step": 4000 }, { "epoch": 0.7870461236506379, "grad_norm": 86.89154704336612, "learning_rate": 6.596220165428002e-08, "logits/chosen": -2.0386528968811035, "logits/rejected": -1.1896815299987793, "logps/chosen": -247.26931762695312, "logps/rejected": -323.0134582519531, "loss": 0.6682, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 1.7489330768585205, "rewards/margins": 76.64961242675781, "rewards/rejected": -74.90068054199219, "step": 4010 }, { "epoch": 0.78900883218842, "grad_norm": 94.43064143875763, "learning_rate": 6.48071613277669e-08, "logits/chosen": -2.0176053047180176, "logits/rejected": -1.6793369054794312, "logps/chosen": -244.49893188476562, "logps/rejected": -317.7451477050781, "loss": 0.7922, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -4.502781867980957, "rewards/margins": 45.09677505493164, "rewards/rejected": -49.59955596923828, "step": 4020 }, { "epoch": 0.7909715407262021, "grad_norm": 83.89747910873355, "learning_rate": 6.366081548009553e-08, "logits/chosen": -1.7048943042755127, "logits/rejected": -1.368273377418518, "logps/chosen": -281.6922607421875, "logps/rejected": -314.8491516113281, "loss": 0.8008, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -14.748578071594238, "rewards/margins": 39.09671401977539, "rewards/rejected": -53.84529495239258, "step": 4030 }, { "epoch": 0.7929342492639843, "grad_norm": 89.59162303048441, "learning_rate": 6.252321793019192e-08, "logits/chosen": -1.9720007181167603, "logits/rejected": -1.5491199493408203, "logps/chosen": -224.3168182373047, "logps/rejected": -300.7829895019531, "loss": 0.7103, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 7.4193596839904785, "rewards/margins": 60.46013641357422, "rewards/rejected": -53.0407829284668, "step": 4040 }, { "epoch": 0.7948969578017664, "grad_norm": 76.22156435980037, "learning_rate": 6.139442208626517e-08, "logits/chosen": -1.6844263076782227, "logits/rejected": -0.8048423528671265, "logps/chosen": -202.3559112548828, "logps/rejected": -228.9807586669922, "loss": 0.7947, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -10.484501838684082, "rewards/margins": 58.60529708862305, "rewards/rejected": -69.08979797363281, "step": 4050 }, { "epoch": 0.7968596663395485, "grad_norm": 95.34292010858903, "learning_rate": 6.027448094329963e-08, "logits/chosen": -2.3162643909454346, "logits/rejected": -1.769240140914917, "logps/chosen": -233.7009735107422, "logps/rejected": -313.31439208984375, "loss": 0.7503, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": 9.38758659362793, "rewards/margins": 50.230960845947266, "rewards/rejected": -40.84337615966797, "step": 4060 }, { "epoch": 0.7988223748773308, "grad_norm": 87.55901086848036, "learning_rate": 5.916344708056681e-08, "logits/chosen": -1.9378993511199951, "logits/rejected": -1.4301038980484009, "logps/chosen": -271.2376403808594, "logps/rejected": -298.01019287109375, "loss": 0.732, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -2.0590720176696777, "rewards/margins": 63.57319259643555, "rewards/rejected": -65.63226318359375, "step": 4070 }, { "epoch": 0.8007850834151129, "grad_norm": 161.22877426488552, "learning_rate": 5.8061372659157306e-08, "logits/chosen": -2.2308640480041504, "logits/rejected": -1.6494734287261963, "logps/chosen": -335.3804626464844, "logps/rejected": -373.3377380371094, "loss": 0.6568, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 15.03607177734375, "rewards/margins": 59.40694046020508, "rewards/rejected": -44.37086868286133, "step": 4080 }, { "epoch": 0.802747791952895, "grad_norm": 151.25681472076036, "learning_rate": 5.6968309419531376e-08, "logits/chosen": -2.0670855045318604, "logits/rejected": -1.853826880455017, "logps/chosen": -287.1033935546875, "logps/rejected": -293.79669189453125, "loss": 0.8216, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 2.07009220123291, "rewards/margins": 43.942901611328125, "rewards/rejected": -41.87281036376953, "step": 4090 }, { "epoch": 0.8047105004906772, "grad_norm": 176.49836346768262, "learning_rate": 5.5884308679090525e-08, "logits/chosen": -2.0086522102355957, "logits/rejected": -0.2063540667295456, "logps/chosen": -254.56005859375, "logps/rejected": -276.60748291015625, "loss": 0.6972, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.4778265953063965, "rewards/margins": 85.01644134521484, "rewards/rejected": -92.49427795410156, "step": 4100 }, { "epoch": 0.8066732090284593, "grad_norm": 94.7146682207041, "learning_rate": 5.480942132976732e-08, "logits/chosen": -2.246140241622925, "logits/rejected": -1.1346241235733032, "logps/chosen": -305.69525146484375, "logps/rejected": -247.7290802001953, "loss": 0.7019, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 15.91126823425293, "rewards/margins": 69.90131378173828, "rewards/rejected": -53.99004364013672, "step": 4110 }, { "epoch": 0.8086359175662414, "grad_norm": 174.38115507421878, "learning_rate": 5.374369783563698e-08, "logits/chosen": -2.088773250579834, "logits/rejected": -1.3973766565322876, "logps/chosen": -285.46368408203125, "logps/rejected": -355.5546569824219, "loss": 0.6734, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.30733585357666, "rewards/margins": 69.91325378417969, "rewards/rejected": -76.22058868408203, "step": 4120 }, { "epoch": 0.8105986261040236, "grad_norm": 77.45870777102209, "learning_rate": 5.268718823054752e-08, "logits/chosen": -2.1801886558532715, "logits/rejected": -1.381476879119873, "logps/chosen": -261.764404296875, "logps/rejected": -320.59613037109375, "loss": 0.7451, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -15.550443649291992, "rewards/margins": 57.4810676574707, "rewards/rejected": -73.03150939941406, "step": 4130 }, { "epoch": 0.8125613346418057, "grad_norm": 97.7097052232471, "learning_rate": 5.1639942115771384e-08, "logits/chosen": -1.8034086227416992, "logits/rejected": -2.0853657722473145, "logps/chosen": -245.40231323242188, "logps/rejected": -258.2283630371094, "loss": 0.7598, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": -12.450443267822266, "rewards/margins": 28.371212005615234, "rewards/rejected": -40.821659088134766, "step": 4140 }, { "epoch": 0.8145240431795878, "grad_norm": 131.72840261069902, "learning_rate": 5.060200865767605e-08, "logits/chosen": -2.154517889022827, "logits/rejected": -1.302539587020874, "logps/chosen": -367.76995849609375, "logps/rejected": -372.68817138671875, "loss": 0.6827, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 5.895370006561279, "rewards/margins": 75.14876556396484, "rewards/rejected": -69.2533950805664, "step": 4150 }, { "epoch": 0.81648675171737, "grad_norm": 96.47188829291967, "learning_rate": 4.957343658541632e-08, "logits/chosen": -2.127027988433838, "logits/rejected": -1.0377602577209473, "logps/chosen": -228.1944580078125, "logps/rejected": -353.79034423828125, "loss": 0.6853, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 3.0734543800354004, "rewards/margins": 87.21350860595703, "rewards/rejected": -84.14005279541016, "step": 4160 }, { "epoch": 0.8184494602551521, "grad_norm": 238.99525080447057, "learning_rate": 4.8554274188646215e-08, "logits/chosen": -2.14247465133667, "logits/rejected": -1.1456241607666016, "logps/chosen": -271.426513671875, "logps/rejected": -276.2561950683594, "loss": 0.7894, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -11.255699157714844, "rewards/margins": 54.667327880859375, "rewards/rejected": -65.92302703857422, "step": 4170 }, { "epoch": 0.8204121687929342, "grad_norm": 126.96144349134515, "learning_rate": 4.754456931525208e-08, "logits/chosen": -1.9185256958007812, "logits/rejected": -1.5158227682113647, "logps/chosen": -261.7303771972656, "logps/rejected": -315.32586669921875, "loss": 0.6502, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.830878257751465, "rewards/margins": 51.277183532714844, "rewards/rejected": -45.44630432128906, "step": 4180 }, { "epoch": 0.8223748773307163, "grad_norm": 107.09390664473455, "learning_rate": 4.654436936910622e-08, "logits/chosen": -2.247661828994751, "logits/rejected": -1.231286883354187, "logps/chosen": -308.06390380859375, "logps/rejected": -335.50714111328125, "loss": 0.7442, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.679135799407959, "rewards/margins": 86.95399475097656, "rewards/rejected": -90.63313293457031, "step": 4190 }, { "epoch": 0.8243375858684985, "grad_norm": 75.05067744341699, "learning_rate": 4.555372130784102e-08, "logits/chosen": -2.291424512863159, "logits/rejected": -1.6245908737182617, "logps/chosen": -384.3785705566406, "logps/rejected": -352.60760498046875, "loss": 0.6748, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.7911860942840576, "rewards/margins": 49.311180114746094, "rewards/rejected": -53.10236358642578, "step": 4200 }, { "epoch": 0.8263002944062807, "grad_norm": 80.06055583555016, "learning_rate": 4.45726716406449e-08, "logits/chosen": -2.2434096336364746, "logits/rejected": -2.28462553024292, "logps/chosen": -310.5040588378906, "logps/rejected": -285.14984130859375, "loss": 0.8215, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -1.5155730247497559, "rewards/margins": 25.251733779907227, "rewards/rejected": -26.76730728149414, "step": 4210 }, { "epoch": 0.8282630029440629, "grad_norm": 112.97765078209706, "learning_rate": 4.360126642607842e-08, "logits/chosen": -2.2808432579040527, "logits/rejected": -1.1301907300949097, "logps/chosen": -329.64801025390625, "logps/rejected": -306.8882751464844, "loss": 0.6904, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7564271688461304, "rewards/margins": 69.74730682373047, "rewards/rejected": -68.99088287353516, "step": 4220 }, { "epoch": 0.830225711481845, "grad_norm": 142.14179774074765, "learning_rate": 4.2639551269912034e-08, "logits/chosen": -2.0478198528289795, "logits/rejected": -1.7996965646743774, "logps/chosen": -204.26034545898438, "logps/rejected": -222.8451690673828, "loss": 0.769, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 4.713677883148193, "rewards/margins": 36.44514846801758, "rewards/rejected": -31.731470108032227, "step": 4230 }, { "epoch": 0.8321884200196271, "grad_norm": 168.54125206910211, "learning_rate": 4.168757132298478e-08, "logits/chosen": -1.9967315196990967, "logits/rejected": -1.4878517389297485, "logps/chosen": -274.7620544433594, "logps/rejected": -326.80908203125, "loss": 0.8094, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.931123733520508, "rewards/margins": 34.28893280029297, "rewards/rejected": -50.220054626464844, "step": 4240 }, { "epoch": 0.8341511285574092, "grad_norm": 103.73538036595869, "learning_rate": 4.0745371279084976e-08, "logits/chosen": -2.3339030742645264, "logits/rejected": -1.8180869817733765, "logps/chosen": -235.6595458984375, "logps/rejected": -270.8612060546875, "loss": 0.7294, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 12.684198379516602, "rewards/margins": 59.22025680541992, "rewards/rejected": -46.53606414794922, "step": 4250 }, { "epoch": 0.8361138370951914, "grad_norm": 96.67993668496764, "learning_rate": 3.9812995372851544e-08, "logits/chosen": -2.17364239692688, "logits/rejected": -1.336732029914856, "logps/chosen": -248.5943603515625, "logps/rejected": -299.0509033203125, "loss": 0.7183, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4048044681549072, "rewards/margins": 73.05081176757812, "rewards/rejected": -74.45561218261719, "step": 4260 }, { "epoch": 0.8380765456329735, "grad_norm": 97.89481042127296, "learning_rate": 3.8890487377697265e-08, "logits/chosen": -2.3062121868133545, "logits/rejected": -1.6332759857177734, "logps/chosen": -250.26486206054688, "logps/rejected": -300.39739990234375, "loss": 0.6796, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 3.848731279373169, "rewards/margins": 66.36620330810547, "rewards/rejected": -62.51746368408203, "step": 4270 }, { "epoch": 0.8400392541707556, "grad_norm": 126.89250753603689, "learning_rate": 3.7977890603754e-08, "logits/chosen": -2.335864543914795, "logits/rejected": -1.4245611429214478, "logps/chosen": -329.94439697265625, "logps/rejected": -356.2707824707031, "loss": 0.7496, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 19.062984466552734, "rewards/margins": 62.11712646484375, "rewards/rejected": -43.05413818359375, "step": 4280 }, { "epoch": 0.8420019627085378, "grad_norm": 73.54880361072654, "learning_rate": 3.707524789583891e-08, "logits/chosen": -2.2100203037261963, "logits/rejected": -1.2681772708892822, "logps/chosen": -313.56854248046875, "logps/rejected": -379.10797119140625, "loss": 0.6961, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.24000664055347443, "rewards/margins": 70.55604553222656, "rewards/rejected": -70.31602478027344, "step": 4290 }, { "epoch": 0.8439646712463199, "grad_norm": 85.05567547371574, "learning_rate": 3.6182601631443596e-08, "logits/chosen": -2.3917462825775146, "logits/rejected": -1.7588598728179932, "logps/chosen": -311.25921630859375, "logps/rejected": -328.1839904785156, "loss": 0.6713, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 12.088909149169922, "rewards/margins": 72.71896362304688, "rewards/rejected": -60.63005447387695, "step": 4300 }, { "epoch": 0.845927379784102, "grad_norm": 74.80822480149952, "learning_rate": 3.529999371874381e-08, "logits/chosen": -1.9063971042633057, "logits/rejected": -1.3725688457489014, "logps/chosen": -284.0364074707031, "logps/rejected": -314.7305603027344, "loss": 0.8259, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -2.501081943511963, "rewards/margins": 49.68852233886719, "rewards/rejected": -52.189598083496094, "step": 4310 }, { "epoch": 0.8478900883218842, "grad_norm": 175.46297468772428, "learning_rate": 3.4427465594632555e-08, "logits/chosen": -2.062887668609619, "logits/rejected": -1.0150609016418457, "logps/chosen": -192.48733520507812, "logps/rejected": -247.17459106445312, "loss": 0.6084, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 8.229732513427734, "rewards/margins": 77.38661193847656, "rewards/rejected": -69.1568832397461, "step": 4320 }, { "epoch": 0.8498527968596663, "grad_norm": 83.0201084975644, "learning_rate": 3.356505822277417e-08, "logits/chosen": -2.1239094734191895, "logits/rejected": -1.4656755924224854, "logps/chosen": -286.2351379394531, "logps/rejected": -308.7716064453125, "loss": 0.7916, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -16.147811889648438, "rewards/margins": 30.034753799438477, "rewards/rejected": -46.18256378173828, "step": 4330 }, { "epoch": 0.8518155053974484, "grad_norm": 164.27555944468708, "learning_rate": 3.271281209168186e-08, "logits/chosen": -2.0288209915161133, "logits/rejected": -0.4138711094856262, "logps/chosen": -286.59979248046875, "logps/rejected": -344.2161865234375, "loss": 0.7074, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -10.258631706237793, "rewards/margins": 117.56071472167969, "rewards/rejected": -127.8193359375, "step": 4340 }, { "epoch": 0.8537782139352306, "grad_norm": 401.8024833515605, "learning_rate": 3.187076721281595e-08, "logits/chosen": -2.2470884323120117, "logits/rejected": -1.6072273254394531, "logps/chosen": -244.5692901611328, "logps/rejected": -286.2551574707031, "loss": 0.7833, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 1.0845510959625244, "rewards/margins": 48.13280487060547, "rewards/rejected": -47.048255920410156, "step": 4350 }, { "epoch": 0.8557409224730128, "grad_norm": 145.02200014498902, "learning_rate": 3.1038963118706244e-08, "logits/chosen": -1.663010835647583, "logits/rejected": -0.8582938313484192, "logps/chosen": -281.4682922363281, "logps/rejected": -319.5342102050781, "loss": 0.675, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.079761981964111, "rewards/margins": 82.85012817382812, "rewards/rejected": -86.92988586425781, "step": 4360 }, { "epoch": 0.8577036310107949, "grad_norm": 111.66249813413052, "learning_rate": 3.0217438861095315e-08, "logits/chosen": -2.2533702850341797, "logits/rejected": -1.7338300943374634, "logps/chosen": -197.65830993652344, "logps/rejected": -270.019287109375, "loss": 0.7706, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 8.909185409545898, "rewards/margins": 56.8540153503418, "rewards/rejected": -47.94483184814453, "step": 4370 }, { "epoch": 0.8596663395485771, "grad_norm": 116.84065927284414, "learning_rate": 2.940623300910572e-08, "logits/chosen": -2.5885159969329834, "logits/rejected": -0.3334166407585144, "logps/chosen": -308.74371337890625, "logps/rejected": -327.54132080078125, "loss": 0.6473, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 11.126081466674805, "rewards/margins": 136.9679718017578, "rewards/rejected": -125.84188079833984, "step": 4380 }, { "epoch": 0.8616290480863592, "grad_norm": 330.3815703640555, "learning_rate": 2.860538364742898e-08, "logits/chosen": -2.3705086708068848, "logits/rejected": -1.479203701019287, "logps/chosen": -364.359619140625, "logps/rejected": -276.20538330078125, "loss": 0.7757, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": 5.474006652832031, "rewards/margins": 61.462684631347656, "rewards/rejected": -55.988685607910156, "step": 4390 }, { "epoch": 0.8635917566241413, "grad_norm": 119.85584116690788, "learning_rate": 2.7814928374537334e-08, "logits/chosen": -2.3972699642181396, "logits/rejected": -1.8536288738250732, "logps/chosen": -227.5499725341797, "logps/rejected": -252.0731964111328, "loss": 0.7547, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.391343355178833, "rewards/margins": 47.683197021484375, "rewards/rejected": -49.07453918457031, "step": 4400 }, { "epoch": 0.8655544651619235, "grad_norm": 81.53583454471305, "learning_rate": 2.7034904300918982e-08, "logits/chosen": -1.928810477256775, "logits/rejected": -1.663252830505371, "logps/chosen": -225.41311645507812, "logps/rejected": -319.4206237792969, "loss": 0.7664, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -3.749385118484497, "rewards/margins": 52.845741271972656, "rewards/rejected": -56.595123291015625, "step": 4410 }, { "epoch": 0.8675171736997056, "grad_norm": 83.33203740578249, "learning_rate": 2.62653480473356e-08, "logits/chosen": -2.3722119331359863, "logits/rejected": -1.8683274984359741, "logps/chosen": -274.8641052246094, "logps/rejected": -307.0876770019531, "loss": 0.7302, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -10.882646560668945, "rewards/margins": 48.69913864135742, "rewards/rejected": -59.58177947998047, "step": 4420 }, { "epoch": 0.8694798822374877, "grad_norm": 113.1559500583454, "learning_rate": 2.550629574310309e-08, "logits/chosen": -1.932538390159607, "logits/rejected": -1.1332205533981323, "logps/chosen": -250.6973876953125, "logps/rejected": -353.84375, "loss": 0.6748, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -11.229366302490234, "rewards/margins": 59.25162887573242, "rewards/rejected": -70.48099517822266, "step": 4430 }, { "epoch": 0.8714425907752699, "grad_norm": 141.96626656457408, "learning_rate": 2.475778302439524e-08, "logits/chosen": -2.4903228282928467, "logits/rejected": -1.2155835628509521, "logps/chosen": -311.2945556640625, "logps/rejected": -343.14910888671875, "loss": 0.6712, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 5.359463214874268, "rewards/margins": 99.0355453491211, "rewards/rejected": -93.67609405517578, "step": 4440 }, { "epoch": 0.873405299313052, "grad_norm": 101.93161963870415, "learning_rate": 2.4019845032570875e-08, "logits/chosen": -2.3475632667541504, "logits/rejected": -1.5638697147369385, "logps/chosen": -265.65631103515625, "logps/rejected": -347.5047912597656, "loss": 0.6539, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 6.48675537109375, "rewards/margins": 76.58488464355469, "rewards/rejected": -70.09814453125, "step": 4450 }, { "epoch": 0.8753680078508341, "grad_norm": 142.4396587470619, "learning_rate": 2.3292516412524054e-08, "logits/chosen": -2.3241803646087646, "logits/rejected": -1.3707834482192993, "logps/chosen": -312.6804504394531, "logps/rejected": -279.3455505371094, "loss": 0.7604, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.92746925354004, "rewards/margins": 41.50196075439453, "rewards/rejected": -59.4294319152832, "step": 4460 }, { "epoch": 0.8773307163886163, "grad_norm": 121.49273996062102, "learning_rate": 2.2575831311057225e-08, "logits/chosen": -2.235206127166748, "logits/rejected": -1.1417607069015503, "logps/chosen": -232.7095184326172, "logps/rejected": -329.55523681640625, "loss": 0.7582, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.6491926908493042, "rewards/margins": 96.53202056884766, "rewards/rejected": -95.88282775878906, "step": 4470 }, { "epoch": 0.8792934249263984, "grad_norm": 122.65751640085026, "learning_rate": 2.1869823375278483e-08, "logits/chosen": -1.9124761819839478, "logits/rejected": -0.03196978196501732, "logps/chosen": -229.1486358642578, "logps/rejected": -339.6845397949219, "loss": 0.6888, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -15.792703628540039, "rewards/margins": 121.9757080078125, "rewards/rejected": -137.76840209960938, "step": 4480 }, { "epoch": 0.8812561334641805, "grad_norm": 157.54138298909007, "learning_rate": 2.1174525751021578e-08, "logits/chosen": -2.098029851913452, "logits/rejected": -1.3914148807525635, "logps/chosen": -264.681884765625, "logps/rejected": -338.7335510253906, "loss": 0.7262, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 7.292047023773193, "rewards/margins": 74.92854309082031, "rewards/rejected": -67.63648986816406, "step": 4490 }, { "epoch": 0.8832188420019627, "grad_norm": 112.68173700283117, "learning_rate": 2.0489971081290193e-08, "logits/chosen": -2.2475361824035645, "logits/rejected": -1.3851878643035889, "logps/chosen": -297.9952392578125, "logps/rejected": -300.0269470214844, "loss": 0.7455, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.08291969448328018, "rewards/margins": 59.639869689941406, "rewards/rejected": -59.55695724487305, "step": 4500 }, { "epoch": 0.8851815505397449, "grad_norm": 161.38110760932548, "learning_rate": 1.9816191504724826e-08, "logits/chosen": -2.288626194000244, "logits/rejected": -1.3858747482299805, "logps/chosen": -221.8231964111328, "logps/rejected": -280.6064453125, "loss": 0.7119, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 4.42663049697876, "rewards/margins": 69.74522399902344, "rewards/rejected": -65.31858825683594, "step": 4510 }, { "epoch": 0.887144259077527, "grad_norm": 120.35203062049091, "learning_rate": 1.9153218654094498e-08, "logits/chosen": -2.4012207984924316, "logits/rejected": -1.6703811883926392, "logps/chosen": -269.9552307128906, "logps/rejected": -305.6309509277344, "loss": 0.6596, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 9.14642333984375, "rewards/margins": 72.54405212402344, "rewards/rejected": -63.39762496948242, "step": 4520 }, { "epoch": 0.8891069676153092, "grad_norm": 153.26100493931048, "learning_rate": 1.8501083654811206e-08, "logits/chosen": -1.9297215938568115, "logits/rejected": -1.855120062828064, "logps/chosen": -297.70208740234375, "logps/rejected": -330.72198486328125, "loss": 0.6354, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.4198187589645386, "rewards/margins": 68.16569519042969, "rewards/rejected": -69.58551788330078, "step": 4530 }, { "epoch": 0.8910696761530913, "grad_norm": 167.88777467721852, "learning_rate": 1.7859817123469068e-08, "logits/chosen": -1.7986654043197632, "logits/rejected": -1.3502906560897827, "logps/chosen": -209.4529266357422, "logps/rejected": -274.34869384765625, "loss": 0.7884, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -8.415654182434082, "rewards/margins": 36.35877227783203, "rewards/rejected": -44.77442169189453, "step": 4540 }, { "epoch": 0.8930323846908734, "grad_norm": 101.38589185656626, "learning_rate": 1.7229449166406477e-08, "logits/chosen": -2.164694309234619, "logits/rejected": -1.3587043285369873, "logps/chosen": -335.20159912109375, "logps/rejected": -337.04803466796875, "loss": 0.7179, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.264878273010254, "rewards/margins": 69.10057067871094, "rewards/rejected": -74.36544799804688, "step": 4550 }, { "epoch": 0.8949950932286556, "grad_norm": 97.87590197623693, "learning_rate": 1.66100093782931e-08, "logits/chosen": -1.906011939048767, "logits/rejected": -1.2428722381591797, "logps/chosen": -259.1355895996094, "logps/rejected": -398.6679992675781, "loss": 0.5731, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 2.6651008129119873, "rewards/margins": 110.20516204833984, "rewards/rejected": -107.54007720947266, "step": 4560 }, { "epoch": 0.8969578017664377, "grad_norm": 156.32475785218165, "learning_rate": 1.600152684074005e-08, "logits/chosen": -2.2164413928985596, "logits/rejected": -1.48879075050354, "logps/chosen": -322.0838928222656, "logps/rejected": -374.07513427734375, "loss": 0.7341, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 1.236524224281311, "rewards/margins": 58.26641845703125, "rewards/rejected": -57.0298957824707, "step": 4570 }, { "epoch": 0.8989205103042198, "grad_norm": 83.09134902449158, "learning_rate": 1.540403012093483e-08, "logits/chosen": -2.400970220565796, "logits/rejected": -1.6487184762954712, "logps/chosen": -289.4566345214844, "logps/rejected": -271.651611328125, "loss": 0.6859, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 18.699176788330078, "rewards/margins": 51.11631774902344, "rewards/rejected": -32.41714096069336, "step": 4580 }, { "epoch": 0.900883218842002, "grad_norm": 93.95593155530334, "learning_rate": 1.4817547270300185e-08, "logits/chosen": -2.1774754524230957, "logits/rejected": -1.8916664123535156, "logps/chosen": -281.515380859375, "logps/rejected": -403.0048522949219, "loss": 0.81, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -13.691302299499512, "rewards/margins": 38.8759651184082, "rewards/rejected": -52.56726837158203, "step": 4590 }, { "epoch": 0.9028459273797841, "grad_norm": 108.18490118027587, "learning_rate": 1.4242105823176837e-08, "logits/chosen": -2.4013946056365967, "logits/rejected": -1.0448533296585083, "logps/chosen": -300.7716369628906, "logps/rejected": -332.5327453613281, "loss": 0.6767, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 19.133121490478516, "rewards/margins": 96.89375305175781, "rewards/rejected": -77.76062774658203, "step": 4600 }, { "epoch": 0.9048086359175662, "grad_norm": 114.66377551517579, "learning_rate": 1.3677732795531083e-08, "logits/chosen": -2.1420085430145264, "logits/rejected": -1.4790115356445312, "logps/chosen": -273.5298767089844, "logps/rejected": -380.37188720703125, "loss": 0.7513, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -9.378705024719238, "rewards/margins": 56.67119216918945, "rewards/rejected": -66.04989624023438, "step": 4610 }, { "epoch": 0.9067713444553483, "grad_norm": 114.52520437579405, "learning_rate": 1.3124454683686364e-08, "logits/chosen": -2.0067858695983887, "logits/rejected": -1.509436845779419, "logps/chosen": -266.0092468261719, "logps/rejected": -337.27252197265625, "loss": 0.7232, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -16.14985466003418, "rewards/margins": 57.49946975708008, "rewards/rejected": -73.64933013916016, "step": 4620 }, { "epoch": 0.9087340529931305, "grad_norm": 159.90748268071226, "learning_rate": 1.2582297463079288e-08, "logits/chosen": -2.209411144256592, "logits/rejected": -1.139925241470337, "logps/chosen": -240.65225219726562, "logps/rejected": -233.8152618408203, "loss": 0.7639, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -11.584493637084961, "rewards/margins": 67.28956604003906, "rewards/rejected": -78.87406921386719, "step": 4630 }, { "epoch": 0.9106967615309126, "grad_norm": 68.25430147889973, "learning_rate": 1.2051286587040049e-08, "logits/chosen": -2.238595962524414, "logits/rejected": -1.411048173904419, "logps/chosen": -272.5920715332031, "logps/rejected": -332.48388671875, "loss": 0.6988, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 7.771451473236084, "rewards/margins": 63.557891845703125, "rewards/rejected": -55.78643798828125, "step": 4640 }, { "epoch": 0.9126594700686947, "grad_norm": 94.88063470495919, "learning_rate": 1.1531446985597604e-08, "logits/chosen": -2.2080559730529785, "logits/rejected": -1.6409868001937866, "logps/chosen": -371.98809814453125, "logps/rejected": -342.58795166015625, "loss": 0.7394, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2149124145507812, "rewards/margins": 59.49456787109375, "rewards/rejected": -56.27965545654297, "step": 4650 }, { "epoch": 0.914622178606477, "grad_norm": 79.35742254078137, "learning_rate": 1.1022803064309194e-08, "logits/chosen": -2.114619731903076, "logits/rejected": -1.5008548498153687, "logps/chosen": -321.5436096191406, "logps/rejected": -437.80670166015625, "loss": 0.7272, "rewards/accuracies": 0.533333420753479, "rewards/chosen": -3.9915764331817627, "rewards/margins": 52.97105026245117, "rewards/rejected": -56.962608337402344, "step": 4660 }, { "epoch": 0.9165848871442591, "grad_norm": 166.1428319000804, "learning_rate": 1.0525378703114401e-08, "logits/chosen": -2.5316622257232666, "logits/rejected": -1.7637078762054443, "logps/chosen": -212.4622802734375, "logps/rejected": -243.0324249267578, "loss": 0.792, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -1.5041046142578125, "rewards/margins": 51.37511444091797, "rewards/rejected": -52.87921905517578, "step": 4670 }, { "epoch": 0.9185475956820413, "grad_norm": 92.78333347509592, "learning_rate": 1.0039197255214238e-08, "logits/chosen": -2.081002712249756, "logits/rejected": -1.5838547945022583, "logps/chosen": -199.47564697265625, "logps/rejected": -303.1565856933594, "loss": 0.6867, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -11.204691886901855, "rewards/margins": 60.508644104003906, "rewards/rejected": -71.71333312988281, "step": 4680 }, { "epoch": 0.9205103042198234, "grad_norm": 174.1861533035901, "learning_rate": 9.564281545974661e-09, "logits/chosen": -2.3713724613189697, "logits/rejected": -1.8105186223983765, "logps/chosen": -249.69351196289062, "logps/rejected": -319.3671569824219, "loss": 0.7624, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 8.186200141906738, "rewards/margins": 77.68107604980469, "rewards/rejected": -69.494873046875, "step": 4690 }, { "epoch": 0.9224730127576055, "grad_norm": 73.88755971780854, "learning_rate": 9.100653871854963e-09, "logits/chosen": -2.3458399772644043, "logits/rejected": -2.128664493560791, "logps/chosen": -301.72882080078125, "logps/rejected": -326.0343933105469, "loss": 0.7304, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -5.737387657165527, "rewards/margins": 31.184558868408203, "rewards/rejected": -36.92194366455078, "step": 4700 }, { "epoch": 0.9244357212953876, "grad_norm": 99.44590172633377, "learning_rate": 8.648335999360934e-09, "logits/chosen": -2.166839599609375, "logits/rejected": -1.1509124040603638, "logps/chosen": -255.38076782226562, "logps/rejected": -271.965087890625, "loss": 0.662, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.875688552856445, "rewards/margins": 68.0612564086914, "rewards/rejected": -62.1855583190918, "step": 4710 }, { "epoch": 0.9263984298331698, "grad_norm": 108.11894398140765, "learning_rate": 8.207349164023047e-09, "logits/chosen": -1.8029752969741821, "logits/rejected": -1.4828283786773682, "logps/chosen": -264.055419921875, "logps/rejected": -322.57318115234375, "loss": 0.7488, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 0.5839412808418274, "rewards/margins": 61.9434814453125, "rewards/rejected": -61.359527587890625, "step": 4720 }, { "epoch": 0.9283611383709519, "grad_norm": 179.60283946000771, "learning_rate": 7.777714069399532e-09, "logits/chosen": -2.312992572784424, "logits/rejected": -1.530349612236023, "logps/chosen": -285.323486328125, "logps/rejected": -307.5158996582031, "loss": 0.7645, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -12.045283317565918, "rewards/margins": 40.80082321166992, "rewards/rejected": -52.846107482910156, "step": 4730 }, { "epoch": 0.930323846908734, "grad_norm": 68.96994751650308, "learning_rate": 7.359450886104263e-09, "logits/chosen": -2.0951597690582275, "logits/rejected": -1.0962153673171997, "logps/chosen": -305.55694580078125, "logps/rejected": -352.0008850097656, "loss": 0.7332, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -7.826642036437988, "rewards/margins": 101.0040054321289, "rewards/rejected": -108.83065032958984, "step": 4740 }, { "epoch": 0.9322865554465162, "grad_norm": 144.83842208025314, "learning_rate": 6.9525792508597634e-09, "logits/chosen": -2.284339189529419, "logits/rejected": -1.9272350072860718, "logps/chosen": -287.0955810546875, "logps/rejected": -319.30023193359375, "loss": 0.6899, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8660587072372437, "rewards/margins": 34.735557556152344, "rewards/rejected": -32.86949920654297, "step": 4750 }, { "epoch": 0.9342492639842983, "grad_norm": 272.7541379681586, "learning_rate": 6.557118265575451e-09, "logits/chosen": -2.038954496383667, "logits/rejected": -1.7726876735687256, "logps/chosen": -308.8658752441406, "logps/rejected": -330.85174560546875, "loss": 0.6407, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -3.6966445446014404, "rewards/margins": 51.185882568359375, "rewards/rejected": -54.882530212402344, "step": 4760 }, { "epoch": 0.9362119725220804, "grad_norm": 116.50844784548013, "learning_rate": 6.1730864964507636e-09, "logits/chosen": -2.4716408252716064, "logits/rejected": -1.640854835510254, "logps/chosen": -285.5713806152344, "logps/rejected": -278.3987121582031, "loss": 0.6211, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 11.198638916015625, "rewards/margins": 50.42829895019531, "rewards/rejected": -39.229652404785156, "step": 4770 }, { "epoch": 0.9381746810598626, "grad_norm": 121.61460098428927, "learning_rate": 5.8005019731033615e-09, "logits/chosen": -2.223710536956787, "logits/rejected": -1.2268856763839722, "logps/chosen": -276.4895935058594, "logps/rejected": -301.57769775390625, "loss": 0.7551, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 3.399601459503174, "rewards/margins": 81.63949584960938, "rewards/rejected": -78.2398910522461, "step": 4780 }, { "epoch": 0.9401373895976447, "grad_norm": 142.78343704075465, "learning_rate": 5.439382187722968e-09, "logits/chosen": -2.3372154235839844, "logits/rejected": -1.7055232524871826, "logps/chosen": -390.32952880859375, "logps/rejected": -355.1389465332031, "loss": 0.7467, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -6.962498664855957, "rewards/margins": 61.8531608581543, "rewards/rejected": -68.81565856933594, "step": 4790 }, { "epoch": 0.9421000981354269, "grad_norm": 96.66803975265674, "learning_rate": 5.089744094249837e-09, "logits/chosen": -2.591423511505127, "logits/rejected": -1.0603770017623901, "logps/chosen": -372.6617126464844, "logps/rejected": -356.703369140625, "loss": 0.6995, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -2.7440848350524902, "rewards/margins": 89.78263854980469, "rewards/rejected": -92.52672576904297, "step": 4800 }, { "epoch": 0.9440628066732091, "grad_norm": 189.9383639314406, "learning_rate": 4.751604107579077e-09, "logits/chosen": -2.498969793319702, "logits/rejected": -1.592284083366394, "logps/chosen": -272.3357238769531, "logps/rejected": -324.1437072753906, "loss": 0.6887, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 11.265649795532227, "rewards/margins": 88.8044662475586, "rewards/rejected": -77.538818359375, "step": 4810 }, { "epoch": 0.9460255152109912, "grad_norm": 150.51304748849157, "learning_rate": 4.424978102789661e-09, "logits/chosen": -2.3503851890563965, "logits/rejected": -1.231005311012268, "logps/chosen": -393.6268005371094, "logps/rejected": -324.7212219238281, "loss": 0.7197, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 12.89924430847168, "rewards/margins": 85.99956512451172, "rewards/rejected": -73.1003189086914, "step": 4820 }, { "epoch": 0.9479882237487733, "grad_norm": 83.74477305736671, "learning_rate": 4.109881414399524e-09, "logits/chosen": -2.4709324836730957, "logits/rejected": -1.6512641906738281, "logps/chosen": -274.58892822265625, "logps/rejected": -338.42388916015625, "loss": 0.7377, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 4.047939777374268, "rewards/margins": 61.45874786376953, "rewards/rejected": -57.41080856323242, "step": 4830 }, { "epoch": 0.9499509322865555, "grad_norm": 98.61723974808852, "learning_rate": 3.806328835645272e-09, "logits/chosen": -1.6199476718902588, "logits/rejected": -1.5394824743270874, "logps/chosen": -258.01824951171875, "logps/rejected": -296.22308349609375, "loss": 0.8038, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -16.628238677978516, "rewards/margins": 50.29301071166992, "rewards/rejected": -66.92124938964844, "step": 4840 }, { "epoch": 0.9519136408243376, "grad_norm": 85.36481837997191, "learning_rate": 3.5143346177878565e-09, "logits/chosen": -2.4662258625030518, "logits/rejected": -1.7831413745880127, "logps/chosen": -356.816650390625, "logps/rejected": -332.0841064453125, "loss": 0.7425, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 12.198331832885742, "rewards/margins": 74.44357299804688, "rewards/rejected": -62.2452392578125, "step": 4850 }, { "epoch": 0.9538763493621197, "grad_norm": 99.43481175006332, "learning_rate": 3.233912469443545e-09, "logits/chosen": -2.127804756164551, "logits/rejected": -0.847798228263855, "logps/chosen": -332.4073181152344, "logps/rejected": -301.81744384765625, "loss": 0.7967, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -4.643928050994873, "rewards/margins": 77.57685089111328, "rewards/rejected": -82.22077941894531, "step": 4860 }, { "epoch": 0.9558390578999019, "grad_norm": 95.16916217420084, "learning_rate": 2.9650755559401388e-09, "logits/chosen": -2.149732828140259, "logits/rejected": -1.253413438796997, "logps/chosen": -337.0418395996094, "logps/rejected": -381.5945739746094, "loss": 0.7683, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 9.873931884765625, "rewards/margins": 77.65193939208984, "rewards/rejected": -67.77800750732422, "step": 4870 }, { "epoch": 0.957801766437684, "grad_norm": 79.05891931903145, "learning_rate": 2.7078364986990175e-09, "logits/chosen": -2.0552120208740234, "logits/rejected": -1.3566513061523438, "logps/chosen": -404.00299072265625, "logps/rejected": -365.36309814453125, "loss": 0.7438, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -8.263569831848145, "rewards/margins": 47.22583770751953, "rewards/rejected": -55.489410400390625, "step": 4880 }, { "epoch": 0.9597644749754661, "grad_norm": 152.68280196677185, "learning_rate": 2.4622073746426165e-09, "logits/chosen": -2.2737069129943848, "logits/rejected": -1.6034488677978516, "logps/chosen": -268.2265625, "logps/rejected": -297.4512939453125, "loss": 0.7497, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 5.681783676147461, "rewards/margins": 77.997802734375, "rewards/rejected": -72.31602478027344, "step": 4890 }, { "epoch": 0.9617271835132483, "grad_norm": 98.1598179784665, "learning_rate": 2.2281997156273213e-09, "logits/chosen": -2.1924538612365723, "logits/rejected": -1.4625017642974854, "logps/chosen": -307.6187438964844, "logps/rejected": -304.46929931640625, "loss": 0.8161, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 4.7274041175842285, "rewards/margins": 50.50579071044922, "rewards/rejected": -45.77838897705078, "step": 4900 }, { "epoch": 0.9636898920510304, "grad_norm": 67.27432729984191, "learning_rate": 2.0058245079021265e-09, "logits/chosen": -2.378800868988037, "logits/rejected": -1.6980470418930054, "logps/chosen": -255.7166748046875, "logps/rejected": -255.8156280517578, "loss": 0.6538, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.02839994430542, "rewards/margins": 57.828468322753906, "rewards/rejected": -53.80005645751953, "step": 4910 }, { "epoch": 0.9656526005888125, "grad_norm": 114.02972272098553, "learning_rate": 1.7950921915928784e-09, "logits/chosen": -1.9510562419891357, "logits/rejected": -1.1935961246490479, "logps/chosen": -268.18902587890625, "logps/rejected": -269.76934814453125, "loss": 0.6993, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.8380398750305176, "rewards/margins": 56.02686309814453, "rewards/rejected": -58.864898681640625, "step": 4920 }, { "epoch": 0.9676153091265947, "grad_norm": 124.20213918508121, "learning_rate": 1.596012660212087e-09, "logits/chosen": -2.1742806434631348, "logits/rejected": -1.2920210361480713, "logps/chosen": -316.1720275878906, "logps/rejected": -283.9969177246094, "loss": 0.6952, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5218753814697266, "rewards/margins": 66.9983139038086, "rewards/rejected": -64.47643280029297, "step": 4930 }, { "epoch": 0.9695780176643768, "grad_norm": 83.53872022433598, "learning_rate": 1.408595260194434e-09, "logits/chosen": -2.2443652153015137, "logits/rejected": -1.0468299388885498, "logps/chosen": -319.724853515625, "logps/rejected": -282.7727966308594, "loss": 0.7472, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -4.605959892272949, "rewards/margins": 77.07535552978516, "rewards/rejected": -81.68131256103516, "step": 4940 }, { "epoch": 0.971540726202159, "grad_norm": 150.15505604435086, "learning_rate": 1.2328487904580131e-09, "logits/chosen": -2.142683982849121, "logits/rejected": -1.4257400035858154, "logps/chosen": -235.9318084716797, "logps/rejected": -311.386474609375, "loss": 0.7477, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.11069107055664, "rewards/margins": 69.9094009399414, "rewards/rejected": -92.02010345458984, "step": 4950 }, { "epoch": 0.9735034347399412, "grad_norm": 95.13212760959374, "learning_rate": 1.0687815019912173e-09, "logits/chosen": -2.0351216793060303, "logits/rejected": -1.3504893779754639, "logps/chosen": -274.8755798339844, "logps/rejected": -397.5858459472656, "loss": 0.5681, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 5.5987653732299805, "rewards/margins": 71.98918151855469, "rewards/rejected": -66.39041900634766, "step": 4960 }, { "epoch": 0.9754661432777233, "grad_norm": 84.67199380376164, "learning_rate": 9.164010974653802e-10, "logits/chosen": -2.398902177810669, "logits/rejected": -1.733372688293457, "logps/chosen": -252.38427734375, "logps/rejected": -327.160400390625, "loss": 0.658, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 15.55370044708252, "rewards/margins": 70.15989685058594, "rewards/rejected": -54.606201171875, "step": 4970 }, { "epoch": 0.9774288518155054, "grad_norm": 127.45417754170342, "learning_rate": 7.757147308731504e-10, "logits/chosen": -2.335721015930176, "logits/rejected": -1.4073684215545654, "logps/chosen": -303.7568664550781, "logps/rejected": -342.23590087890625, "loss": 0.7113, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 4.049750804901123, "rewards/margins": 77.05052185058594, "rewards/rejected": -73.00077819824219, "step": 4980 }, { "epoch": 0.9793915603532876, "grad_norm": 115.06685123760994, "learning_rate": 6.467290071925646e-10, "logits/chosen": -1.8174736499786377, "logits/rejected": -1.6374919414520264, "logps/chosen": -227.05233764648438, "logps/rejected": -278.3191223144531, "loss": 0.8276, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -13.057368278503418, "rewards/margins": 43.30392074584961, "rewards/rejected": -56.36128616333008, "step": 4990 }, { "epoch": 0.9813542688910697, "grad_norm": 123.5747084203326, "learning_rate": 5.29449982077046e-10, "logits/chosen": -2.1923646926879883, "logits/rejected": -1.5888545513153076, "logps/chosen": -287.0592041015625, "logps/rejected": -281.2122497558594, "loss": 0.6539, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.531325101852417, "rewards/margins": 64.5676498413086, "rewards/rejected": -67.09896850585938, "step": 5000 }, { "epoch": 0.9833169774288518, "grad_norm": 108.27924016984268, "learning_rate": 4.2388316157104806e-10, "logits/chosen": -2.2981438636779785, "logits/rejected": -1.2112444639205933, "logps/chosen": -262.627685546875, "logps/rejected": -287.81927490234375, "loss": 0.6455, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 5.26472806930542, "rewards/margins": 71.73152160644531, "rewards/rejected": -66.46678161621094, "step": 5010 }, { "epoch": 0.985279685966634, "grad_norm": 89.46387418143505, "learning_rate": 3.300335018515676e-10, "logits/chosen": -2.403501033782959, "logits/rejected": -1.4914408922195435, "logps/chosen": -208.60482788085938, "logps/rejected": -222.968017578125, "loss": 0.7064, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 5.1732282638549805, "rewards/margins": 63.36035919189453, "rewards/rejected": -58.1871337890625, "step": 5020 }, { "epoch": 0.9872423945044161, "grad_norm": 124.87034244507268, "learning_rate": 2.4790540899546907e-10, "logits/chosen": -2.0438072681427, "logits/rejected": -1.5360976457595825, "logps/chosen": -229.02890014648438, "logps/rejected": -369.798583984375, "loss": 0.6945, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.978261470794678, "rewards/margins": 87.14382934570312, "rewards/rejected": -94.12208557128906, "step": 5030 }, { "epoch": 0.9892051030421982, "grad_norm": 130.5820464371203, "learning_rate": 1.7750273877262244e-10, "logits/chosen": -2.1564626693725586, "logits/rejected": -1.4923003911972046, "logps/chosen": -289.1947326660156, "logps/rejected": -321.1419372558594, "loss": 0.6874, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0988696813583374, "rewards/margins": 58.76612091064453, "rewards/rejected": -59.864990234375, "step": 5040 }, { "epoch": 0.9911678115799804, "grad_norm": 70.01942114867353, "learning_rate": 1.1882879646485379e-10, "logits/chosen": -2.0114634037017822, "logits/rejected": -0.8014649152755737, "logps/chosen": -232.0609588623047, "logps/rejected": -308.7328186035156, "loss": 0.6061, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -6.858293056488037, "rewards/margins": 86.15089416503906, "rewards/rejected": -93.00920104980469, "step": 5050 }, { "epoch": 0.9931305201177625, "grad_norm": 110.40304088386422, "learning_rate": 7.188633671079136e-11, "logits/chosen": -2.2709622383117676, "logits/rejected": -1.3523612022399902, "logps/chosen": -283.89459228515625, "logps/rejected": -275.1901550292969, "loss": 0.7711, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 4.313065528869629, "rewards/margins": 69.71455383300781, "rewards/rejected": -65.40147399902344, "step": 5060 }, { "epoch": 0.9950932286555446, "grad_norm": 81.48865923488107, "learning_rate": 3.6677563376580344e-11, "logits/chosen": -2.363847255706787, "logits/rejected": -1.6017096042633057, "logps/chosen": -265.7796630859375, "logps/rejected": -377.3988037109375, "loss": 0.7834, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -0.959048867225647, "rewards/margins": 65.90235900878906, "rewards/rejected": -66.86140441894531, "step": 5070 }, { "epoch": 0.9970559371933267, "grad_norm": 95.32968344599277, "learning_rate": 1.3204129452354385e-11, "logits/chosen": -2.1981334686279297, "logits/rejected": -1.328355073928833, "logps/chosen": -276.4037170410156, "logps/rejected": -357.8365783691406, "loss": 0.6956, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -5.322502136230469, "rewards/margins": 67.26332092285156, "rewards/rejected": -72.58582305908203, "step": 5080 }, { "epoch": 0.9990186457311089, "grad_norm": 91.41982755349929, "learning_rate": 1.467136974631078e-12, "logits/chosen": -2.234492063522339, "logits/rejected": -1.3233827352523804, "logps/chosen": -272.5258483886719, "logps/rejected": -255.92886352539062, "loss": 0.5828, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": -0.6036813855171204, "rewards/margins": 53.4892692565918, "rewards/rejected": -54.09294891357422, "step": 5090 }, { "epoch": 1.0, "step": 5095, "total_flos": 0.0, "train_loss": 0.7772327337929498, "train_runtime": 14337.9489, "train_samples_per_second": 4.264, "train_steps_per_second": 0.355 } ], "logging_steps": 10, "max_steps": 5095, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }