{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5095, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019627085377821394, "grad_norm": 55.42176668437576, "learning_rate": 9.803921568627451e-10, "logits/chosen": -2.9195547103881836, "logits/rejected": -2.4565553665161133, "logps/chosen": -421.782470703125, "logps/rejected": -89.33955383300781, "loss": 0.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.001962708537782139, "grad_norm": 52.13427578764769, "learning_rate": 9.803921568627451e-09, "logits/chosen": -2.5578858852386475, "logits/rejected": -2.5531322956085205, "logps/chosen": -328.5317687988281, "logps/rejected": -224.82073974609375, "loss": 0.0, "rewards/accuracies": 0.37037035822868347, "rewards/chosen": -0.02090207114815712, "rewards/margins": -0.03216912969946861, "rewards/rejected": 0.011267063207924366, "step": 10 }, { "epoch": 0.003925417075564278, "grad_norm": 54.72055344030846, "learning_rate": 1.9607843137254902e-08, "logits/chosen": -2.7481472492218018, "logits/rejected": -2.649394989013672, "logps/chosen": -241.4879608154297, "logps/rejected": -228.73208618164062, "loss": -0.0, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 0.08823471516370773, "rewards/margins": -0.021855510771274567, "rewards/rejected": 0.1100902333855629, "step": 20 }, { "epoch": 0.005888125613346418, "grad_norm": 45.2682166869957, "learning_rate": 2.941176470588235e-08, "logits/chosen": -2.806262493133545, "logits/rejected": -2.7496838569641113, "logps/chosen": -271.72894287109375, "logps/rejected": -277.15478515625, "loss": 0.0007, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 0.004963749088346958, "rewards/margins": -0.026950735598802567, "rewards/rejected": 0.031914494931697845, "step": 30 }, { "epoch": 0.007850834151128557, "grad_norm": 53.82565047074732, "learning_rate": 3.9215686274509804e-08, "logits/chosen": -2.5292630195617676, "logits/rejected": -2.6139206886291504, "logps/chosen": -236.1393585205078, "logps/rejected": -199.0591583251953, "loss": -0.0004, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 0.02007109485566616, "rewards/margins": -0.018103918060660362, "rewards/rejected": 0.038175009191036224, "step": 40 }, { "epoch": 0.009813542688910697, "grad_norm": 49.03057543008255, "learning_rate": 4.901960784313725e-08, "logits/chosen": -2.7650465965270996, "logits/rejected": -2.7305190563201904, "logps/chosen": -265.3066711425781, "logps/rejected": -283.49688720703125, "loss": 0.0002, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 0.114730104804039, "rewards/margins": 0.01123037189245224, "rewards/rejected": 0.10349973291158676, "step": 50 }, { "epoch": 0.011776251226692836, "grad_norm": 46.617995449740285, "learning_rate": 5.88235294117647e-08, "logits/chosen": -2.756399154663086, "logits/rejected": -2.6721854209899902, "logps/chosen": -254.83731079101562, "logps/rejected": -236.0925750732422, "loss": 0.0001, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 0.056176573038101196, "rewards/margins": 0.018731657415628433, "rewards/rejected": 0.037444911897182465, "step": 60 }, { "epoch": 0.013738959764474975, "grad_norm": 48.30068604915362, "learning_rate": 6.862745098039216e-08, "logits/chosen": -2.8364510536193848, "logits/rejected": -2.7723422050476074, "logps/chosen": -301.88037109375, "logps/rejected": -237.03024291992188, "loss": -0.0006, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.22586870193481445, "rewards/margins": -0.024675820022821426, "rewards/rejected": 0.2505444884300232, "step": 70 }, { "epoch": 0.015701668302257114, "grad_norm": 56.77370393350773, "learning_rate": 7.843137254901961e-08, "logits/chosen": -2.7583060264587402, "logits/rejected": -2.558380365371704, "logps/chosen": -324.0039367675781, "logps/rejected": -213.39865112304688, "loss": -0.0012, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 0.27130740880966187, "rewards/margins": 0.2214820384979248, "rewards/rejected": 0.049825381487607956, "step": 80 }, { "epoch": 0.017664376840039256, "grad_norm": 56.40949159712068, "learning_rate": 8.823529411764706e-08, "logits/chosen": -2.820873737335205, "logits/rejected": -2.7982966899871826, "logps/chosen": -269.5506896972656, "logps/rejected": -272.46112060546875, "loss": -0.0023, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 0.6012493371963501, "rewards/margins": 0.16042634844779968, "rewards/rejected": 0.4408229887485504, "step": 90 }, { "epoch": 0.019627085377821395, "grad_norm": 47.26297740382754, "learning_rate": 9.80392156862745e-08, "logits/chosen": -2.7922067642211914, "logits/rejected": -2.697592258453369, "logps/chosen": -295.4319152832031, "logps/rejected": -259.5749206542969, "loss": -0.0027, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 0.5905193090438843, "rewards/margins": 0.2526652216911316, "rewards/rejected": 0.3378540575504303, "step": 100 }, { "epoch": 0.021589793915603533, "grad_norm": 55.31106672287658, "learning_rate": 1.0784313725490195e-07, "logits/chosen": -2.796839475631714, "logits/rejected": -2.708120346069336, "logps/chosen": -331.6488342285156, "logps/rejected": -292.35662841796875, "loss": -0.0031, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0887248516082764, "rewards/margins": 0.5024398565292358, "rewards/rejected": 0.5862849950790405, "step": 110 }, { "epoch": 0.023552502453385672, "grad_norm": 42.53164026045271, "learning_rate": 1.176470588235294e-07, "logits/chosen": -2.6476073265075684, "logits/rejected": -2.551424503326416, "logps/chosen": -202.0703125, "logps/rejected": -188.40463256835938, "loss": -0.0066, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 0.8433626294136047, "rewards/margins": 0.8148363828659058, "rewards/rejected": 0.028526177629828453, "step": 120 }, { "epoch": 0.02551521099116781, "grad_norm": 49.94411838179539, "learning_rate": 1.2745098039215685e-07, "logits/chosen": -2.5919604301452637, "logits/rejected": -2.6266636848449707, "logps/chosen": -359.3283386230469, "logps/rejected": -299.9968566894531, "loss": -0.0085, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 1.8167911767959595, "rewards/margins": 0.9724923372268677, "rewards/rejected": 0.8442989587783813, "step": 130 }, { "epoch": 0.02747791952894995, "grad_norm": 69.33020557085253, "learning_rate": 1.3725490196078432e-07, "logits/chosen": -2.6862921714782715, "logits/rejected": -2.717078924179077, "logps/chosen": -192.0702667236328, "logps/rejected": -193.63720703125, "loss": -0.0118, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 1.122182846069336, "rewards/margins": 1.1919901371002197, "rewards/rejected": -0.06980731338262558, "step": 140 }, { "epoch": 0.029440628066732092, "grad_norm": 54.7726336269782, "learning_rate": 1.4705882352941175e-07, "logits/chosen": -2.8408827781677246, "logits/rejected": -2.7343363761901855, "logps/chosen": -233.08447265625, "logps/rejected": -224.98486328125, "loss": -0.0127, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 1.0583146810531616, "rewards/margins": 0.6715062856674194, "rewards/rejected": 0.38680845499038696, "step": 150 }, { "epoch": 0.03140333660451423, "grad_norm": 64.64847493045461, "learning_rate": 1.5686274509803921e-07, "logits/chosen": -2.778686761856079, "logits/rejected": -2.6812801361083984, "logps/chosen": -277.0428466796875, "logps/rejected": -219.3829803466797, "loss": -0.0299, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 1.49662446975708, "rewards/margins": 2.666884660720825, "rewards/rejected": -1.1702601909637451, "step": 160 }, { "epoch": 0.033366045142296366, "grad_norm": 56.673663390323526, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -2.6702511310577393, "logits/rejected": -2.645601272583008, "logps/chosen": -248.2321014404297, "logps/rejected": -208.69473266601562, "loss": -0.0275, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1382129192352295, "rewards/margins": 2.059967279434204, "rewards/rejected": -2.1981797218322754, "step": 170 }, { "epoch": 0.03532875368007851, "grad_norm": 39.98296588741384, "learning_rate": 1.764705882352941e-07, "logits/chosen": -2.751546621322632, "logits/rejected": -2.6485252380371094, "logps/chosen": -259.86846923828125, "logps/rejected": -251.5321044921875, "loss": -0.0243, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -1.0277085304260254, "rewards/margins": 3.4545974731445312, "rewards/rejected": -4.48230504989624, "step": 180 }, { "epoch": 0.03729146221786065, "grad_norm": 60.40011532777635, "learning_rate": 1.8627450980392158e-07, "logits/chosen": -2.6494269371032715, "logits/rejected": -2.5910208225250244, "logps/chosen": -318.39764404296875, "logps/rejected": -260.9273376464844, "loss": -0.026, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.21862542629241943, "rewards/margins": 3.9593727588653564, "rewards/rejected": -4.1779985427856445, "step": 190 }, { "epoch": 0.03925417075564279, "grad_norm": 70.42455751936757, "learning_rate": 1.96078431372549e-07, "logits/chosen": -2.767629623413086, "logits/rejected": -2.573652744293213, "logps/chosen": -286.00787353515625, "logps/rejected": -202.26034545898438, "loss": -0.0439, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 0.8227108120918274, "rewards/margins": 6.793312072753906, "rewards/rejected": -5.9706010818481445, "step": 200 }, { "epoch": 0.04121687929342493, "grad_norm": 56.45488111494288, "learning_rate": 2.0588235294117645e-07, "logits/chosen": -2.5618834495544434, "logits/rejected": -2.4814367294311523, "logps/chosen": -245.49246215820312, "logps/rejected": -242.6617431640625, "loss": -0.0677, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -3.1054611206054688, "rewards/margins": 4.085428714752197, "rewards/rejected": -7.19088888168335, "step": 210 }, { "epoch": 0.04317958783120707, "grad_norm": 57.21351703174619, "learning_rate": 2.156862745098039e-07, "logits/chosen": -2.731902837753296, "logits/rejected": -2.685002088546753, "logps/chosen": -292.84552001953125, "logps/rejected": -297.1029357910156, "loss": -0.0697, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -4.646189212799072, "rewards/margins": 6.997344970703125, "rewards/rejected": -11.643535614013672, "step": 220 }, { "epoch": 0.045142296368989206, "grad_norm": 52.528417372744045, "learning_rate": 2.2549019607843137e-07, "logits/chosen": -2.7624478340148926, "logits/rejected": -2.628632068634033, "logps/chosen": -273.2393493652344, "logps/rejected": -216.01034545898438, "loss": -0.0825, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6.331894874572754, "rewards/margins": 7.152884006500244, "rewards/rejected": -13.484777450561523, "step": 230 }, { "epoch": 0.047105004906771344, "grad_norm": 64.61244327489912, "learning_rate": 2.352941176470588e-07, "logits/chosen": -2.666316032409668, "logits/rejected": -2.649465799331665, "logps/chosen": -253.55532836914062, "logps/rejected": -267.13409423828125, "loss": -0.0949, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -3.607656478881836, "rewards/margins": 16.110401153564453, "rewards/rejected": -19.718055725097656, "step": 240 }, { "epoch": 0.04906771344455348, "grad_norm": 76.24175017198084, "learning_rate": 2.4509803921568627e-07, "logits/chosen": -2.6245503425598145, "logits/rejected": -2.536576509475708, "logps/chosen": -249.9552001953125, "logps/rejected": -221.0200958251953, "loss": -0.1128, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -10.527206420898438, "rewards/margins": 7.4732489585876465, "rewards/rejected": -18.00045394897461, "step": 250 }, { "epoch": 0.05103042198233562, "grad_norm": 109.36007939895549, "learning_rate": 2.549019607843137e-07, "logits/chosen": -2.7169508934020996, "logits/rejected": -2.6889023780822754, "logps/chosen": -324.5906066894531, "logps/rejected": -271.66864013671875, "loss": -0.1297, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": -6.839819431304932, "rewards/margins": 15.588996887207031, "rewards/rejected": -22.428817749023438, "step": 260 }, { "epoch": 0.05299313052011776, "grad_norm": 73.6401928280892, "learning_rate": 2.6470588235294114e-07, "logits/chosen": -2.634995222091675, "logits/rejected": -2.5977635383605957, "logps/chosen": -227.48666381835938, "logps/rejected": -238.53683471679688, "loss": -0.0796, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 0.1082758903503418, "rewards/margins": 15.6939697265625, "rewards/rejected": -15.58569049835205, "step": 270 }, { "epoch": 0.0549558390578999, "grad_norm": 77.8054698576488, "learning_rate": 2.7450980392156863e-07, "logits/chosen": -2.6497695446014404, "logits/rejected": -2.5452229976654053, "logps/chosen": -245.05020141601562, "logps/rejected": -227.035400390625, "loss": -0.1983, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": 0.3784346580505371, "rewards/margins": 25.546825408935547, "rewards/rejected": -25.168392181396484, "step": 280 }, { "epoch": 0.05691854759568204, "grad_norm": 69.38700770802893, "learning_rate": 2.8431372549019607e-07, "logits/chosen": -2.7868704795837402, "logits/rejected": -2.6382501125335693, "logps/chosen": -308.83795166015625, "logps/rejected": -262.008544921875, "loss": -0.2034, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -4.117677688598633, "rewards/margins": 26.686620712280273, "rewards/rejected": -30.80430030822754, "step": 290 }, { "epoch": 0.058881256133464184, "grad_norm": 232.07899924814748, "learning_rate": 2.941176470588235e-07, "logits/chosen": -2.6749207973480225, "logits/rejected": -2.6329691410064697, "logps/chosen": -277.1617431640625, "logps/rejected": -311.02001953125, "loss": -0.156, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 6.130270004272461, "rewards/margins": 16.96583366394043, "rewards/rejected": -10.835563659667969, "step": 300 }, { "epoch": 0.06084396467124632, "grad_norm": 85.89483352711162, "learning_rate": 3.0392156862745094e-07, "logits/chosen": -2.6615962982177734, "logits/rejected": -2.5684642791748047, "logps/chosen": -311.0503234863281, "logps/rejected": -257.279052734375, "loss": -0.2022, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -8.99604606628418, "rewards/margins": 11.814870834350586, "rewards/rejected": -20.810916900634766, "step": 310 }, { "epoch": 0.06280667320902845, "grad_norm": 67.59826415226972, "learning_rate": 3.1372549019607843e-07, "logits/chosen": -2.637139320373535, "logits/rejected": -2.650726318359375, "logps/chosen": -203.72000122070312, "logps/rejected": -229.19052124023438, "loss": -0.1158, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03306236118078232, "rewards/margins": 4.578888893127441, "rewards/rejected": -4.61195182800293, "step": 320 }, { "epoch": 0.0647693817468106, "grad_norm": 127.93824361584238, "learning_rate": 3.2352941176470586e-07, "logits/chosen": -2.7270684242248535, "logits/rejected": -2.676504611968994, "logps/chosen": -280.10504150390625, "logps/rejected": -246.6905059814453, "loss": -0.2429, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -4.071059226989746, "rewards/margins": 28.99440574645996, "rewards/rejected": -33.06546401977539, "step": 330 }, { "epoch": 0.06673209028459273, "grad_norm": 354.12649330343476, "learning_rate": 3.333333333333333e-07, "logits/chosen": -2.80649471282959, "logits/rejected": -2.648033380508423, "logps/chosen": -385.0194396972656, "logps/rejected": -306.19293212890625, "loss": -0.2397, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -34.79177474975586, "rewards/margins": 30.58514404296875, "rewards/rejected": -65.37691497802734, "step": 340 }, { "epoch": 0.06869479882237488, "grad_norm": 72.34501740389159, "learning_rate": 3.431372549019608e-07, "logits/chosen": -2.7408480644226074, "logits/rejected": -2.654379367828369, "logps/chosen": -226.4215087890625, "logps/rejected": -210.12991333007812, "loss": -0.1632, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -35.80000305175781, "rewards/margins": -2.9782471656799316, "rewards/rejected": -32.821754455566406, "step": 350 }, { "epoch": 0.07065750736015702, "grad_norm": 225.547386760939, "learning_rate": 3.529411764705882e-07, "logits/chosen": -2.7857296466827393, "logits/rejected": -2.592670202255249, "logps/chosen": -365.99395751953125, "logps/rejected": -315.8221435546875, "loss": -0.3779, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -15.632776260375977, "rewards/margins": 44.1047248840332, "rewards/rejected": -59.73749923706055, "step": 360 }, { "epoch": 0.07262021589793916, "grad_norm": 96.03708907027585, "learning_rate": 3.6274509803921566e-07, "logits/chosen": -2.7570395469665527, "logits/rejected": -2.6691067218780518, "logps/chosen": -291.181396484375, "logps/rejected": -317.2411193847656, "loss": -0.2726, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -17.516420364379883, "rewards/margins": 36.926048278808594, "rewards/rejected": -54.44246292114258, "step": 370 }, { "epoch": 0.0745829244357213, "grad_norm": 328.56064746254384, "learning_rate": 3.7254901960784315e-07, "logits/chosen": -2.5822367668151855, "logits/rejected": -2.717496395111084, "logps/chosen": -260.66680908203125, "logps/rejected": -355.0137023925781, "loss": -0.3196, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -20.576526641845703, "rewards/margins": 41.59932327270508, "rewards/rejected": -62.17584228515625, "step": 380 }, { "epoch": 0.07654563297350343, "grad_norm": 100.81642864977084, "learning_rate": 3.8235294117647053e-07, "logits/chosen": -2.613299608230591, "logits/rejected": -2.4213340282440186, "logps/chosen": -307.2693176269531, "logps/rejected": -363.9527893066406, "loss": -0.374, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -40.563045501708984, "rewards/margins": 44.697811126708984, "rewards/rejected": -85.26085662841797, "step": 390 }, { "epoch": 0.07850834151128558, "grad_norm": 85.17053705717989, "learning_rate": 3.92156862745098e-07, "logits/chosen": -2.7737975120544434, "logits/rejected": -2.6642842292785645, "logps/chosen": -294.2020568847656, "logps/rejected": -311.3301696777344, "loss": -0.1607, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -29.82242774963379, "rewards/margins": 5.755614280700684, "rewards/rejected": -35.578041076660156, "step": 400 }, { "epoch": 0.08047105004906771, "grad_norm": 84.07689139888112, "learning_rate": 4.019607843137255e-07, "logits/chosen": -2.789400815963745, "logits/rejected": -2.783902645111084, "logps/chosen": -341.98602294921875, "logps/rejected": -332.31890869140625, "loss": -0.3788, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -40.6246337890625, "rewards/margins": 38.11586380004883, "rewards/rejected": -78.74049377441406, "step": 410 }, { "epoch": 0.08243375858684986, "grad_norm": 111.45122162909242, "learning_rate": 4.117647058823529e-07, "logits/chosen": -2.8530049324035645, "logits/rejected": -2.819136142730713, "logps/chosen": -314.96405029296875, "logps/rejected": -415.540283203125, "loss": -0.5942, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -54.3344612121582, "rewards/margins": 84.99993133544922, "rewards/rejected": -139.33438110351562, "step": 420 }, { "epoch": 0.08439646712463199, "grad_norm": 207.65454710276742, "learning_rate": 4.215686274509804e-07, "logits/chosen": -2.9525511264801025, "logits/rejected": -2.7926957607269287, "logps/chosen": -481.3441467285156, "logps/rejected": -369.22003173828125, "loss": -0.372, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -117.11238098144531, "rewards/margins": 11.82046890258789, "rewards/rejected": -128.932861328125, "step": 430 }, { "epoch": 0.08635917566241413, "grad_norm": 154.10223690580898, "learning_rate": 4.313725490196078e-07, "logits/chosen": -2.703326940536499, "logits/rejected": -2.6534676551818848, "logps/chosen": -405.1926574707031, "logps/rejected": -424.9640197753906, "loss": -0.6634, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -105.73802185058594, "rewards/margins": 76.21388244628906, "rewards/rejected": -181.95187377929688, "step": 440 }, { "epoch": 0.08832188420019627, "grad_norm": 384.0225078030504, "learning_rate": 4.4117647058823526e-07, "logits/chosen": -2.9043920040130615, "logits/rejected": -2.965247869491577, "logps/chosen": -438.9114685058594, "logps/rejected": -720.2880249023438, "loss": -1.5813, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -157.97433471679688, "rewards/margins": 272.9202880859375, "rewards/rejected": -430.8946228027344, "step": 450 }, { "epoch": 0.09028459273797841, "grad_norm": 1940.9871743180609, "learning_rate": 4.5098039215686274e-07, "logits/chosen": -2.4680662155151367, "logits/rejected": -2.2228832244873047, "logps/chosen": -1756.931396484375, "logps/rejected": -1876.9644775390625, "loss": -6.8257, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -1507.594970703125, "rewards/margins": 111.273193359375, "rewards/rejected": -1618.8682861328125, "step": 460 }, { "epoch": 0.09224730127576054, "grad_norm": 1533.8722690452116, "learning_rate": 4.6078431372549013e-07, "logits/chosen": -2.1377968788146973, "logits/rejected": -1.9869731664657593, "logps/chosen": -5085.20751953125, "logps/rejected": -6400.43359375, "loss": -11.9304, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -4821.6015625, "rewards/margins": 1242.607666015625, "rewards/rejected": -6064.20849609375, "step": 470 }, { "epoch": 0.09421000981354269, "grad_norm": 6741.031275288143, "learning_rate": 4.705882352941176e-07, "logits/chosen": -2.7642159461975098, "logits/rejected": -2.6780083179473877, "logps/chosen": -4329.5908203125, "logps/rejected": -4531.9853515625, "loss": 2.0111, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -4009.44873046875, "rewards/margins": 199.69712829589844, "rewards/rejected": -4209.14599609375, "step": 480 }, { "epoch": 0.09617271835132483, "grad_norm": 1796.8019695955134, "learning_rate": 4.803921568627451e-07, "logits/chosen": -2.7348339557647705, "logits/rejected": -2.6679794788360596, "logps/chosen": -2115.27392578125, "logps/rejected": -3323.93994140625, "loss": -4.5983, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1820.602294921875, "rewards/margins": 1181.430908203125, "rewards/rejected": -3002.03271484375, "step": 490 }, { "epoch": 0.09813542688910697, "grad_norm": 3353.4593094629063, "learning_rate": 4.901960784313725e-07, "logits/chosen": -3.0273239612579346, "logits/rejected": -3.043909788131714, "logps/chosen": -2941.096923828125, "logps/rejected": -3184.89794921875, "loss": -12.0357, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2641.497802734375, "rewards/margins": 292.752197265625, "rewards/rejected": -2934.25048828125, "step": 500 }, { "epoch": 0.10009813542688911, "grad_norm": 1772.5245479851885, "learning_rate": 5e-07, "logits/chosen": -2.921410322189331, "logits/rejected": -2.9772872924804688, "logps/chosen": -6094.3212890625, "logps/rejected": -5644.60986328125, "loss": 1.0821, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": -5806.25927734375, "rewards/margins": -434.4007873535156, "rewards/rejected": -5371.8583984375, "step": 510 }, { "epoch": 0.10206084396467124, "grad_norm": 3125.6709548857507, "learning_rate": 4.999941314693213e-07, "logits/chosen": -3.0188825130462646, "logits/rejected": -2.9688477516174316, "logps/chosen": -6226.7451171875, "logps/rejected": -6661.93115234375, "loss": -11.1655, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -5977.10205078125, "rewards/margins": 478.0450134277344, "rewards/rejected": -6455.1474609375, "step": 520 }, { "epoch": 0.10402355250245339, "grad_norm": 5180.5997502065375, "learning_rate": 4.999765261528027e-07, "logits/chosen": -3.2866196632385254, "logits/rejected": -3.1494357585906982, "logps/chosen": -5611.6826171875, "logps/rejected": -7808.75244140625, "loss": -11.7137, "rewards/accuracies": 0.5, "rewards/chosen": -5322.3271484375, "rewards/margins": 2170.20751953125, "rewards/rejected": -7492.5341796875, "step": 530 }, { "epoch": 0.10598626104023552, "grad_norm": 3752.1380381960794, "learning_rate": 4.999471848769828e-07, "logits/chosen": -3.181457281112671, "logits/rejected": -3.2540550231933594, "logps/chosen": -5376.814453125, "logps/rejected": -6871.60693359375, "loss": -11.5894, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": -5086.708984375, "rewards/margins": 1471.297119140625, "rewards/rejected": -6558.00634765625, "step": 540 }, { "epoch": 0.10794896957801767, "grad_norm": 7856.385644107133, "learning_rate": 4.999061090193831e-07, "logits/chosen": -3.4750685691833496, "logits/rejected": -3.532654285430908, "logps/chosen": -6213.30859375, "logps/rejected": -5363.4853515625, "loss": -12.1155, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": -5890.5, "rewards/margins": -828.7430419921875, "rewards/rejected": -5061.7578125, "step": 550 }, { "epoch": 0.1099116781157998, "grad_norm": 3248.3254891752, "learning_rate": 4.998533005084428e-07, "logits/chosen": -3.53291392326355, "logits/rejected": -3.5924575328826904, "logps/chosen": -3382.26123046875, "logps/rejected": -4808.04736328125, "loss": -8.4086, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -3100.73583984375, "rewards/margins": 1455.527587890625, "rewards/rejected": -4556.26416015625, "step": 560 }, { "epoch": 0.11187438665358194, "grad_norm": 5469.8848454301, "learning_rate": 4.997887618234292e-07, "logits/chosen": -3.650221586227417, "logits/rejected": -3.711533784866333, "logps/chosen": -4809.1513671875, "logps/rejected": -6255.3798828125, "loss": -31.673, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -4527.5029296875, "rewards/margins": 1419.117431640625, "rewards/rejected": -5946.61962890625, "step": 570 }, { "epoch": 0.11383709519136408, "grad_norm": 6564.9062573568335, "learning_rate": 4.997124959943201e-07, "logits/chosen": -3.3947842121124268, "logits/rejected": -3.3778738975524902, "logps/chosen": -13080.7685546875, "logps/rejected": -9107.861328125, "loss": 8.8239, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -12836.861328125, "rewards/margins": -3942.26708984375, "rewards/rejected": -8894.59375, "step": 580 }, { "epoch": 0.11579980372914622, "grad_norm": 5422.079695255457, "learning_rate": 4.996245066016623e-07, "logits/chosen": -3.4116103649139404, "logits/rejected": -3.3361339569091797, "logps/chosen": -10116.0234375, "logps/rejected": -11018.990234375, "loss": -4.731, "rewards/accuracies": 0.5666666030883789, "rewards/chosen": -9860.8955078125, "rewards/margins": 926.3974609375, "rewards/rejected": -10787.2939453125, "step": 590 }, { "epoch": 0.11776251226692837, "grad_norm": 3621.303673581134, "learning_rate": 4.995247977764035e-07, "logits/chosen": -3.5449604988098145, "logits/rejected": -3.407780170440674, "logps/chosen": -3392.916748046875, "logps/rejected": -6572.7216796875, "loss": -13.0392, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3158.355712890625, "rewards/margins": 3204.546875, "rewards/rejected": -6362.90283203125, "step": 600 }, { "epoch": 0.1197252208047105, "grad_norm": 2004.7364542593111, "learning_rate": 4.994133741996982e-07, "logits/chosen": -3.6601264476776123, "logits/rejected": -3.7173964977264404, "logps/chosen": -1705.1253662109375, "logps/rejected": -2238.16259765625, "loss": -5.1537, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1423.309814453125, "rewards/margins": 575.7150268554688, "rewards/rejected": -1999.0247802734375, "step": 610 }, { "epoch": 0.12168792934249265, "grad_norm": 4218.172795174711, "learning_rate": 4.992902411026877e-07, "logits/chosen": -3.606293201446533, "logits/rejected": -3.561081647872925, "logps/chosen": -4419.13916015625, "logps/rejected": -4373.31884765625, "loss": -9.3513, "rewards/accuracies": 0.6333332657814026, "rewards/chosen": -4153.29638671875, "rewards/margins": -122.78157043457031, "rewards/rejected": -4030.514892578125, "step": 620 }, { "epoch": 0.12365063788027478, "grad_norm": 5936.340493561604, "learning_rate": 4.991554042662548e-07, "logits/chosen": -3.846891403198242, "logits/rejected": -3.78739595413208, "logps/chosen": -4390.55322265625, "logps/rejected": -5842.3115234375, "loss": -5.706, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -4141.6005859375, "rewards/margins": 1460.165771484375, "rewards/rejected": -5601.76611328125, "step": 630 }, { "epoch": 0.1256133464180569, "grad_norm": 7251.735889606725, "learning_rate": 4.990088700207525e-07, "logits/chosen": -4.1332526206970215, "logits/rejected": -3.9190673828125, "logps/chosen": -3458.100830078125, "logps/rejected": -6369.9404296875, "loss": -14.8476, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3248.2763671875, "rewards/margins": 2881.4208984375, "rewards/rejected": -6129.69775390625, "step": 640 }, { "epoch": 0.12757605495583907, "grad_norm": 3507.3147161054585, "learning_rate": 4.988506452457066e-07, "logits/chosen": -4.082102298736572, "logits/rejected": -4.06673526763916, "logps/chosen": -2457.28564453125, "logps/rejected": -3667.985595703125, "loss": -8.2134, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": -2160.65087890625, "rewards/margins": 1211.0816650390625, "rewards/rejected": -3371.732421875, "step": 650 }, { "epoch": 0.1295387634936212, "grad_norm": 3988.999058787467, "learning_rate": 4.986807373694925e-07, "logits/chosen": -4.032341480255127, "logits/rejected": -4.118239879608154, "logps/chosen": -3139.01123046875, "logps/rejected": -3685.5859375, "loss": -12.1803, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -2875.022216796875, "rewards/margins": 542.792236328125, "rewards/rejected": -3417.814453125, "step": 660 }, { "epoch": 0.13150147203140333, "grad_norm": 8771.176753932707, "learning_rate": 4.984991543689869e-07, "logits/chosen": -4.0565972328186035, "logits/rejected": -4.150533199310303, "logps/chosen": -4282.26416015625, "logps/rejected": -5035.6845703125, "loss": -14.2264, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3998.866455078125, "rewards/margins": 756.2091064453125, "rewards/rejected": -4755.0751953125, "step": 670 }, { "epoch": 0.13346418056918546, "grad_norm": 2610.48599165418, "learning_rate": 4.983059047691931e-07, "logits/chosen": -4.347467422485352, "logits/rejected": -4.5054612159729, "logps/chosen": -3816.40625, "logps/rejected": -3973.56103515625, "loss": -19.0198, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3557.932861328125, "rewards/margins": 201.40988159179688, "rewards/rejected": -3759.342529296875, "step": 680 }, { "epoch": 0.13542688910696762, "grad_norm": 10381.93552014944, "learning_rate": 4.981009976428408e-07, "logits/chosen": -4.395026206970215, "logits/rejected": -4.382890224456787, "logps/chosen": -3988.653076171875, "logps/rejected": -5918.6103515625, "loss": -34.8657, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": -3677.35107421875, "rewards/margins": 1974.8812255859375, "rewards/rejected": -5652.23291015625, "step": 690 }, { "epoch": 0.13738959764474976, "grad_norm": 14657.196242480899, "learning_rate": 4.9788444260996e-07, "logits/chosen": -4.675393581390381, "logits/rejected": -4.638659477233887, "logps/chosen": -4381.5712890625, "logps/rejected": -10704.0107421875, "loss": -32.5017, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -4117.3544921875, "rewards/margins": 6328.7890625, "rewards/rejected": -10446.14453125, "step": 700 }, { "epoch": 0.1393523061825319, "grad_norm": 6306.842404336326, "learning_rate": 4.976562498374295e-07, "logits/chosen": -4.886357307434082, "logits/rejected": -4.988670349121094, "logps/chosen": -5579.8671875, "logps/rejected": -7533.5029296875, "loss": -23.9449, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -5290.119140625, "rewards/margins": 1984.4984130859375, "rewards/rejected": -7274.6171875, "step": 710 }, { "epoch": 0.14131501472031405, "grad_norm": 16796.642362375413, "learning_rate": 4.974164300384997e-07, "logits/chosen": -4.664653778076172, "logits/rejected": -4.7305378913879395, "logps/chosen": -3581.632080078125, "logps/rejected": -7852.50634765625, "loss": -21.2661, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -3350.428955078125, "rewards/margins": 4208.6533203125, "rewards/rejected": -7559.0810546875, "step": 720 }, { "epoch": 0.14327772325809618, "grad_norm": 3315.428620654237, "learning_rate": 4.971649944722893e-07, "logits/chosen": -4.925482749938965, "logits/rejected": -4.885300636291504, "logps/chosen": -2285.36181640625, "logps/rejected": -4620.7470703125, "loss": -18.0952, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2032.9224853515625, "rewards/margins": 2304.1201171875, "rewards/rejected": -4337.04296875, "step": 730 }, { "epoch": 0.1452404317958783, "grad_norm": 9811.000469729088, "learning_rate": 4.96901954943257e-07, "logits/chosen": -4.881524562835693, "logits/rejected": -5.207891464233398, "logps/chosen": -6457.2255859375, "logps/rejected": -6290.47314453125, "loss": -14.9692, "rewards/accuracies": 0.5, "rewards/chosen": -6191.6083984375, "rewards/margins": -67.74050903320312, "rewards/rejected": -6123.8681640625, "step": 740 }, { "epoch": 0.14720314033366044, "grad_norm": 13363.58172746426, "learning_rate": 4.96627323800647e-07, "logits/chosen": -5.046450138092041, "logits/rejected": -5.161167144775391, "logps/chosen": -6698.05224609375, "logps/rejected": -6591.6865234375, "loss": -24.6609, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -6460.36328125, "rewards/margins": -113.03096008300781, "rewards/rejected": -6347.33154296875, "step": 750 }, { "epoch": 0.1491658488714426, "grad_norm": 9555.080847284651, "learning_rate": 4.963411139379099e-07, "logits/chosen": -5.049764633178711, "logits/rejected": -5.199952602386475, "logps/chosen": -5243.54248046875, "logps/rejected": -4104.63623046875, "loss": -12.1895, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -4952.9326171875, "rewards/margins": -1108.8414306640625, "rewards/rejected": -3844.091796875, "step": 760 }, { "epoch": 0.15112855740922473, "grad_norm": 12023.588574930149, "learning_rate": 4.960433387920964e-07, "logits/chosen": -5.316850185394287, "logits/rejected": -5.298564910888672, "logps/chosen": -4310.51318359375, "logps/rejected": -7034.9072265625, "loss": -24.8207, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -4140.767578125, "rewards/margins": 2601.36279296875, "rewards/rejected": -6742.1298828125, "step": 770 }, { "epoch": 0.15309126594700687, "grad_norm": 7784.409805357992, "learning_rate": 4.957340123432271e-07, "logits/chosen": -4.923598289489746, "logits/rejected": -5.511407852172852, "logps/chosen": -4989.72509765625, "logps/rejected": -6824.4716796875, "loss": -20.4824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4657.15673828125, "rewards/margins": 1935.213623046875, "rewards/rejected": -6592.3701171875, "step": 780 }, { "epoch": 0.155053974484789, "grad_norm": 19605.6891507753, "learning_rate": 4.954131491136361e-07, "logits/chosen": -5.60308837890625, "logits/rejected": -5.888776779174805, "logps/chosen": -6126.0810546875, "logps/rejected": -8215.982421875, "loss": -22.0307, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -5800.68701171875, "rewards/margins": 2126.65966796875, "rewards/rejected": -7927.34765625, "step": 790 }, { "epoch": 0.15701668302257116, "grad_norm": 9567.250950643886, "learning_rate": 4.95080764167289e-07, "logits/chosen": -5.4140543937683105, "logits/rejected": -5.838421821594238, "logps/chosen": -3287.478515625, "logps/rejected": -10459.359375, "loss": -46.221, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": -3037.16162109375, "rewards/margins": 7148.86181640625, "rewards/rejected": -10186.0234375, "step": 800 }, { "epoch": 0.1589793915603533, "grad_norm": 5031.798106614742, "learning_rate": 4.94736873109076e-07, "logits/chosen": -5.7605485916137695, "logits/rejected": -6.0007100105285645, "logps/chosen": -8628.2353515625, "logps/rejected": -10313.3779296875, "loss": -14.6818, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8375.9248046875, "rewards/margins": 1705.6243896484375, "rewards/rejected": -10081.5498046875, "step": 810 }, { "epoch": 0.16094210009813542, "grad_norm": 14281.451020890998, "learning_rate": 4.943814920840787e-07, "logits/chosen": -5.832943439483643, "logits/rejected": -5.913120746612549, "logps/chosen": -5706.1865234375, "logps/rejected": -7254.32958984375, "loss": -3.8791, "rewards/accuracies": 0.5, "rewards/chosen": -5427.99267578125, "rewards/margins": 1560.8138427734375, "rewards/rejected": -6988.80615234375, "step": 820 }, { "epoch": 0.16290480863591755, "grad_norm": 25219.950961219838, "learning_rate": 4.940146377768126e-07, "logits/chosen": -5.86264705657959, "logits/rejected": -6.38746976852417, "logps/chosen": -8420.1923828125, "logps/rejected": -11860.4892578125, "loss": -11.7428, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": -8132.7919921875, "rewards/margins": 3497.96923828125, "rewards/rejected": -11630.76171875, "step": 830 }, { "epoch": 0.1648675171736997, "grad_norm": 11820.38891026719, "learning_rate": 4.936363274104441e-07, "logits/chosen": -5.593775749206543, "logits/rejected": -6.003030776977539, "logps/chosen": -9974.05859375, "logps/rejected": -10679.541015625, "loss": -29.4741, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -9687.638671875, "rewards/margins": 765.0916137695312, "rewards/rejected": -10452.7314453125, "step": 840 }, { "epoch": 0.16683022571148184, "grad_norm": 5651.031721123486, "learning_rate": 4.932465787459808e-07, "logits/chosen": -5.6370697021484375, "logits/rejected": -5.985367774963379, "logps/chosen": -5189.21728515625, "logps/rejected": -7923.3916015625, "loss": -28.2714, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -4923.140625, "rewards/margins": 2747.0888671875, "rewards/rejected": -7670.2294921875, "step": 850 }, { "epoch": 0.16879293424926398, "grad_norm": 4514.622453035337, "learning_rate": 4.92845410081439e-07, "logits/chosen": -5.711178779602051, "logits/rejected": -6.145454406738281, "logps/chosen": -3792.75390625, "logps/rejected": -7190.0234375, "loss": -19.6348, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -3533.87109375, "rewards/margins": 3372.494140625, "rewards/rejected": -6906.36572265625, "step": 860 }, { "epoch": 0.17075564278704614, "grad_norm": 13714.479430392348, "learning_rate": 4.924328402509833e-07, "logits/chosen": -6.179508209228516, "logits/rejected": -6.293606758117676, "logps/chosen": -6443.6865234375, "logps/rejected": -5546.8642578125, "loss": -6.9228, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6169.89794921875, "rewards/margins": -860.1452026367188, "rewards/rejected": -5309.7529296875, "step": 870 }, { "epoch": 0.17271835132482827, "grad_norm": 7132.283844200657, "learning_rate": 4.920088886240434e-07, "logits/chosen": -5.889359474182129, "logits/rejected": -6.428772926330566, "logps/chosen": -3265.817138671875, "logps/rejected": -6037.52490234375, "loss": -28.6973, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -2981.26220703125, "rewards/margins": 2796.58251953125, "rewards/rejected": -5777.8447265625, "step": 880 }, { "epoch": 0.1746810598626104, "grad_norm": 12477.439205543058, "learning_rate": 4.915735751044045e-07, "logits/chosen": -6.107163429260254, "logits/rejected": -6.2372026443481445, "logps/chosen": -7877.9423828125, "logps/rejected": -8894.537109375, "loss": -31.6577, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -7589.56884765625, "rewards/margins": 1058.4696044921875, "rewards/rejected": -8648.0390625, "step": 890 }, { "epoch": 0.17664376840039253, "grad_norm": 15542.960992387978, "learning_rate": 4.911269201292724e-07, "logits/chosen": -6.000779151916504, "logits/rejected": -6.710317134857178, "logps/chosen": -3909.35546875, "logps/rejected": -10465.1455078125, "loss": -38.5278, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -3614.990966796875, "rewards/margins": 6591.98046875, "rewards/rejected": -10206.9716796875, "step": 900 }, { "epoch": 0.1786064769381747, "grad_norm": 12056.38888908871, "learning_rate": 4.906689446683146e-07, "logits/chosen": -6.439277648925781, "logits/rejected": -6.673526763916016, "logps/chosen": -7288.1708984375, "logps/rejected": -15649.4169921875, "loss": -29.2492, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -7051.0361328125, "rewards/margins": 8281.146484375, "rewards/rejected": -15332.1845703125, "step": 910 }, { "epoch": 0.18056918547595682, "grad_norm": 18457.89236244154, "learning_rate": 4.901996702226755e-07, "logits/chosen": -6.691192626953125, "logits/rejected": -6.481786251068115, "logps/chosen": -7164.22509765625, "logps/rejected": -9232.09765625, "loss": -20.4965, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -6863.5986328125, "rewards/margins": 2030.722900390625, "rewards/rejected": -8894.322265625, "step": 920 }, { "epoch": 0.18253189401373895, "grad_norm": 13185.987368026217, "learning_rate": 4.897191188239667e-07, "logits/chosen": -5.662405967712402, "logits/rejected": -6.346382141113281, "logps/chosen": -4762.76171875, "logps/rejected": -4395.76318359375, "loss": -12.6953, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -4438.7255859375, "rewards/margins": -254.1382598876953, "rewards/rejected": -4184.5869140625, "step": 930 }, { "epoch": 0.1844946025515211, "grad_norm": 9410.580889938821, "learning_rate": 4.892273130332334e-07, "logits/chosen": -5.764897346496582, "logits/rejected": -6.0113205909729, "logps/chosen": -6139.517578125, "logps/rejected": -8035.6376953125, "loss": -15.2011, "rewards/accuracies": 0.6333333849906921, "rewards/chosen": -5811.48486328125, "rewards/margins": 1864.588623046875, "rewards/rejected": -7676.0732421875, "step": 940 }, { "epoch": 0.18645731108930325, "grad_norm": 6342.7586723466875, "learning_rate": 4.887242759398945e-07, "logits/chosen": -5.898398399353027, "logits/rejected": -6.63924503326416, "logps/chosen": -3808.279296875, "logps/rejected": -9303.75390625, "loss": -26.9573, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -3610.69482421875, "rewards/margins": 5499.0751953125, "rewards/rejected": -9109.7705078125, "step": 950 }, { "epoch": 0.18842001962708538, "grad_norm": 11963.022793219014, "learning_rate": 4.88210031160659e-07, "logits/chosen": -6.416249752044678, "logits/rejected": -6.384464263916016, "logps/chosen": -10369.8701171875, "logps/rejected": -12422.373046875, "loss": -12.8583, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -10103.103515625, "rewards/margins": 2066.58056640625, "rewards/rejected": -12169.6845703125, "step": 960 }, { "epoch": 0.1903827281648675, "grad_norm": 10236.943532320936, "learning_rate": 4.876846028384169e-07, "logits/chosen": -6.297173023223877, "logits/rejected": -6.415545463562012, "logps/chosen": -7671.2529296875, "logps/rejected": -10869.4267578125, "loss": -41.6121, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7467.36865234375, "rewards/margins": 3166.47607421875, "rewards/rejected": -10633.8447265625, "step": 970 }, { "epoch": 0.19234543670264967, "grad_norm": 3993.1790669278917, "learning_rate": 4.87148015641106e-07, "logits/chosen": -6.627077579498291, "logits/rejected": -6.740579128265381, "logps/chosen": -10304.3701171875, "logps/rejected": -12665.5478515625, "loss": -29.8854, "rewards/accuracies": 0.5, "rewards/chosen": -10053.3193359375, "rewards/margins": 2337.34716796875, "rewards/rejected": -12390.666015625, "step": 980 }, { "epoch": 0.1943081452404318, "grad_norm": 17480.574536648135, "learning_rate": 4.866002947605539e-07, "logits/chosen": -6.591230869293213, "logits/rejected": -7.154505729675293, "logps/chosen": -10526.505859375, "logps/rejected": -14869.9345703125, "loss": -30.0988, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -10289.6123046875, "rewards/margins": 4337.68896484375, "rewards/rejected": -14627.302734375, "step": 990 }, { "epoch": 0.19627085377821393, "grad_norm": 18835.090181075317, "learning_rate": 4.860414659112948e-07, "logits/chosen": -5.732474327087402, "logits/rejected": -6.906656742095947, "logps/chosen": -4464.4931640625, "logps/rejected": -8855.4443359375, "loss": -28.0699, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -4226.84423828125, "rewards/margins": 4435.8427734375, "rewards/rejected": -8662.6865234375, "step": 1000 }, { "epoch": 0.19823356231599606, "grad_norm": 14912.744857224145, "learning_rate": 4.854715553293627e-07, "logits/chosen": -5.851419925689697, "logits/rejected": -6.499886989593506, "logps/chosen": -3528.130126953125, "logps/rejected": -3690.87158203125, "loss": -16.9536, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -3213.92919921875, "rewards/margins": 274.3960876464844, "rewards/rejected": -3488.324951171875, "step": 1010 }, { "epoch": 0.20019627085377822, "grad_norm": 18008.596576282962, "learning_rate": 4.848905897710595e-07, "logits/chosen": -6.312150001525879, "logits/rejected": -7.0706353187561035, "logps/chosen": -7834.62744140625, "logps/rejected": -9298.2373046875, "loss": -23.2271, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7523.5634765625, "rewards/margins": 1557.814697265625, "rewards/rejected": -9081.37890625, "step": 1020 }, { "epoch": 0.20215897939156036, "grad_norm": 28163.76736229375, "learning_rate": 4.842985965116987e-07, "logits/chosen": -6.157286643981934, "logits/rejected": -6.599762916564941, "logps/chosen": -9029.1123046875, "logps/rejected": -9565.55859375, "loss": -18.8913, "rewards/accuracies": 0.533333420753479, "rewards/chosen": -8688.0771484375, "rewards/margins": 595.7215576171875, "rewards/rejected": -9283.798828125, "step": 1030 }, { "epoch": 0.2041216879293425, "grad_norm": 18398.085147750124, "learning_rate": 4.836956033443253e-07, "logits/chosen": -6.422955513000488, "logits/rejected": -6.9488205909729, "logps/chosen": -6672.7880859375, "logps/rejected": -11461.138671875, "loss": -36.4218, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -6305.60595703125, "rewards/margins": 4774.3662109375, "rewards/rejected": -11079.9716796875, "step": 1040 }, { "epoch": 0.20608439646712462, "grad_norm": 33799.680292125966, "learning_rate": 4.830816385784104e-07, "logits/chosen": -6.353148460388184, "logits/rejected": -6.8012285232543945, "logps/chosen": -6364.4853515625, "logps/rejected": -10160.666015625, "loss": -15.8248, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6056.6572265625, "rewards/margins": 3825.34716796875, "rewards/rejected": -9882.0029296875, "step": 1050 }, { "epoch": 0.20804710500490678, "grad_norm": 27435.164709135457, "learning_rate": 4.824567310385226e-07, "logits/chosen": -6.066521644592285, "logits/rejected": -6.341080665588379, "logps/chosen": -3572.680908203125, "logps/rejected": -5190.095703125, "loss": -17.2576, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -3255.31103515625, "rewards/margins": 1672.730712890625, "rewards/rejected": -4928.04150390625, "step": 1060 }, { "epoch": 0.2100098135426889, "grad_norm": 19323.966688241275, "learning_rate": 4.818209100629744e-07, "logits/chosen": -6.24301815032959, "logits/rejected": -6.491910457611084, "logps/chosen": -6063.89599609375, "logps/rejected": -9733.435546875, "loss": -28.2124, "rewards/accuracies": 0.533333420753479, "rewards/chosen": -5815.2255859375, "rewards/margins": 3647.280517578125, "rewards/rejected": -9462.5068359375, "step": 1070 }, { "epoch": 0.21197252208047104, "grad_norm": 9947.385414651284, "learning_rate": 4.81174205502445e-07, "logits/chosen": -6.536830902099609, "logits/rejected": -6.754624366760254, "logps/chosen": -4830.169921875, "logps/rejected": -7181.6748046875, "loss": -36.5645, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -4608.97021484375, "rewards/margins": 2361.175048828125, "rewards/rejected": -6970.1455078125, "step": 1080 }, { "epoch": 0.2139352306182532, "grad_norm": 7003.663363553055, "learning_rate": 4.80516647718579e-07, "logits/chosen": -6.829509735107422, "logits/rejected": -6.8753252029418945, "logps/chosen": -9009.7744140625, "logps/rejected": -9554.724609375, "loss": -10.8078, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -8770.001953125, "rewards/margins": 530.52978515625, "rewards/rejected": -9300.529296875, "step": 1090 }, { "epoch": 0.21589793915603533, "grad_norm": 20506.944638987403, "learning_rate": 4.798482675825602e-07, "logits/chosen": -5.99752950668335, "logits/rejected": -6.124609470367432, "logps/chosen": -4564.5078125, "logps/rejected": -6626.99462890625, "loss": -33.0573, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -4356.3115234375, "rewards/margins": 2022.443359375, "rewards/rejected": -6378.75439453125, "step": 1100 }, { "epoch": 0.21786064769381747, "grad_norm": 23211.377391117898, "learning_rate": 4.791690964736636e-07, "logits/chosen": -6.223307132720947, "logits/rejected": -6.656490325927734, "logps/chosen": -6796.92578125, "logps/rejected": -8325.5654296875, "loss": -17.0997, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -6485.1552734375, "rewards/margins": 1625.4010009765625, "rewards/rejected": -8110.55615234375, "step": 1110 }, { "epoch": 0.2198233562315996, "grad_norm": 16428.257674128483, "learning_rate": 4.78479166277781e-07, "logits/chosen": -6.288302898406982, "logits/rejected": -7.060388088226318, "logps/chosen": -6896.67822265625, "logps/rejected": -12242.1630859375, "loss": -20.4254, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6565.47119140625, "rewards/margins": 5391.685546875, "rewards/rejected": -11957.1572265625, "step": 1120 }, { "epoch": 0.22178606476938176, "grad_norm": 12666.640426738013, "learning_rate": 4.777785093859247e-07, "logits/chosen": -6.124759197235107, "logits/rejected": -6.8685197830200195, "logps/chosen": -10585.0400390625, "logps/rejected": -10163.39453125, "loss": -7.5161, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": -10300.375, "rewards/margins": -429.13250732421875, "rewards/rejected": -9871.2412109375, "step": 1130 }, { "epoch": 0.2237487733071639, "grad_norm": 18242.431831497688, "learning_rate": 4.770671586927063e-07, "logits/chosen": -5.312869071960449, "logits/rejected": -6.013340950012207, "logps/chosen": -8856.7392578125, "logps/rejected": -14245.8125, "loss": -41.7453, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8499.8701171875, "rewards/margins": 5417.36865234375, "rewards/rejected": -13917.2392578125, "step": 1140 }, { "epoch": 0.22571148184494602, "grad_norm": 12133.937239518376, "learning_rate": 4.7634514759479275e-07, "logits/chosen": -5.536364555358887, "logits/rejected": -6.553144931793213, "logps/chosen": -6002.0673828125, "logps/rejected": -8861.6328125, "loss": -40.6393, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -5716.4072265625, "rewards/margins": 2924.58251953125, "rewards/rejected": -8640.9892578125, "step": 1150 }, { "epoch": 0.22767419038272815, "grad_norm": 12798.515058314108, "learning_rate": 4.7561250998933835e-07, "logits/chosen": -5.443411350250244, "logits/rejected": -6.2736639976501465, "logps/chosen": -3569.50390625, "logps/rejected": -6845.41552734375, "loss": -28.3721, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -3225.18310546875, "rewards/margins": 3405.83349609375, "rewards/rejected": -6631.0166015625, "step": 1160 }, { "epoch": 0.2296368989205103, "grad_norm": 11981.948459611636, "learning_rate": 4.7486928027239304e-07, "logits/chosen": -6.173932075500488, "logits/rejected": -6.705090522766113, "logps/chosen": -3848.798828125, "logps/rejected": -8102.4208984375, "loss": -35.9646, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -3643.14990234375, "rewards/margins": 4247.37451171875, "rewards/rejected": -7890.5244140625, "step": 1170 }, { "epoch": 0.23159960745829244, "grad_norm": 14439.768214547857, "learning_rate": 4.7411549333728807e-07, "logits/chosen": -6.110247611999512, "logits/rejected": -6.248814582824707, "logps/chosen": -8091.17822265625, "logps/rejected": -11718.095703125, "loss": -38.4208, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -7812.70458984375, "rewards/margins": 3621.135986328125, "rewards/rejected": -11433.8408203125, "step": 1180 }, { "epoch": 0.23356231599607458, "grad_norm": 14137.963281773538, "learning_rate": 4.7335118457299756e-07, "logits/chosen": -6.655389308929443, "logits/rejected": -7.13067102432251, "logps/chosen": -7468.5830078125, "logps/rejected": -14053.486328125, "loss": -21.7755, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -7180.61181640625, "rewards/margins": 6611.4443359375, "rewards/rejected": -13792.056640625, "step": 1190 }, { "epoch": 0.23552502453385674, "grad_norm": 8780.903475366407, "learning_rate": 4.7257638986247684e-07, "logits/chosen": -6.206356048583984, "logits/rejected": -6.580771446228027, "logps/chosen": -9771.921875, "logps/rejected": -13752.0341796875, "loss": -40.6361, "rewards/accuracies": 0.533333420753479, "rewards/chosen": -9474.7236328125, "rewards/margins": 3932.257080078125, "rewards/rejected": -13406.9814453125, "step": 1200 }, { "epoch": 0.23748773307163887, "grad_norm": 11027.404658116602, "learning_rate": 4.7179114558097814e-07, "logits/chosen": -6.383017063140869, "logits/rejected": -7.095811367034912, "logps/chosen": -10146.6376953125, "logps/rejected": -11640.4931640625, "loss": -31.9294, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -9887.333984375, "rewards/margins": 1548.2554931640625, "rewards/rejected": -11435.5888671875, "step": 1210 }, { "epoch": 0.239450441609421, "grad_norm": 13218.648713252847, "learning_rate": 4.709954885943428e-07, "logits/chosen": -6.283883094787598, "logits/rejected": -6.7995758056640625, "logps/chosen": -6759.9208984375, "logps/rejected": -10460.041015625, "loss": -33.7623, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -6487.17431640625, "rewards/margins": 3755.44677734375, "rewards/rejected": -10242.6201171875, "step": 1220 }, { "epoch": 0.24141315014720313, "grad_norm": 18165.2606801322, "learning_rate": 4.7018945625727026e-07, "logits/chosen": -6.002166748046875, "logits/rejected": -6.142951488494873, "logps/chosen": -9446.447265625, "logps/rejected": -9355.5888671875, "loss": -21.4024, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": -9168.755859375, "rewards/margins": -82.42424011230469, "rewards/rejected": -9086.3310546875, "step": 1230 }, { "epoch": 0.2433758586849853, "grad_norm": 9546.486782653119, "learning_rate": 4.6937308641156447e-07, "logits/chosen": -6.328946113586426, "logits/rejected": -6.592848300933838, "logps/chosen": -4691.10546875, "logps/rejected": -6811.04052734375, "loss": -21.3475, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4481.0068359375, "rewards/margins": 2117.7255859375, "rewards/rejected": -6598.73291015625, "step": 1240 }, { "epoch": 0.24533856722276742, "grad_norm": 15198.297322344595, "learning_rate": 4.685464173843574e-07, "logits/chosen": -5.786673545837402, "logits/rejected": -6.3122358322143555, "logps/chosen": -5133.77197265625, "logps/rejected": -10014.5078125, "loss": -23.3859, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -4899.78076171875, "rewards/margins": 4859.4150390625, "rewards/rejected": -9759.1953125, "step": 1250 }, { "epoch": 0.24730127576054955, "grad_norm": 24930.90080464163, "learning_rate": 4.677094879863093e-07, "logits/chosen": -6.168011665344238, "logits/rejected": -6.7909369468688965, "logps/chosen": -7076.28125, "logps/rejected": -9543.931640625, "loss": -43.9456, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -6827.8720703125, "rewards/margins": 2499.33056640625, "rewards/rejected": -9327.2021484375, "step": 1260 }, { "epoch": 0.2492639842983317, "grad_norm": 10424.227549193865, "learning_rate": 4.66862337509787e-07, "logits/chosen": -6.140045642852783, "logits/rejected": -6.81331729888916, "logps/chosen": -7571.5986328125, "logps/rejected": -10816.103515625, "loss": -29.6378, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -7250.7734375, "rewards/margins": 3345.135986328125, "rewards/rejected": -10595.91015625, "step": 1270 }, { "epoch": 0.2512266928361138, "grad_norm": 5836.998988152928, "learning_rate": 4.660050057270191e-07, "logits/chosen": -6.291211128234863, "logits/rejected": -7.2217512130737305, "logps/chosen": -5159.23876953125, "logps/rejected": -9405.814453125, "loss": -40.6205, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -4903.15087890625, "rewards/margins": 4288.626953125, "rewards/rejected": -9191.7783203125, "step": 1280 }, { "epoch": 0.25318940137389595, "grad_norm": 11244.027822828177, "learning_rate": 4.6513753288822833e-07, "logits/chosen": -6.402543067932129, "logits/rejected": -6.612940788269043, "logps/chosen": -6702.81787109375, "logps/rejected": -9002.962890625, "loss": -21.3048, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -6543.5166015625, "rewards/margins": 2286.71142578125, "rewards/rejected": -8830.2275390625, "step": 1290 }, { "epoch": 0.25515210991167814, "grad_norm": 8233.06526790705, "learning_rate": 4.6425995971974265e-07, "logits/chosen": -6.400839328765869, "logits/rejected": -6.700913429260254, "logps/chosen": -5469.6318359375, "logps/rejected": -6716.37109375, "loss": -22.5222, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -5186.34814453125, "rewards/margins": 1313.0029296875, "rewards/rejected": -6499.3505859375, "step": 1300 }, { "epoch": 0.25711481844946027, "grad_norm": 11206.103743894755, "learning_rate": 4.633723274220824e-07, "logits/chosen": -6.092984199523926, "logits/rejected": -6.375741004943848, "logps/chosen": -6886.3330078125, "logps/rejected": -10701.6572265625, "loss": -32.5831, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -6589.3984375, "rewards/margins": 3788.77001953125, "rewards/rejected": -10378.1689453125, "step": 1310 }, { "epoch": 0.2590775269872424, "grad_norm": 7519.232389352001, "learning_rate": 4.624746776680267e-07, "logits/chosen": -6.1906304359436035, "logits/rejected": -6.620635986328125, "logps/chosen": -4721.166015625, "logps/rejected": -8448.7783203125, "loss": -20.0555, "rewards/accuracies": 0.5666666030883789, "rewards/chosen": -4472.5859375, "rewards/margins": 3729.46484375, "rewards/rejected": -8202.0498046875, "step": 1320 }, { "epoch": 0.26104023552502453, "grad_norm": 34111.858435326576, "learning_rate": 4.6156705260065634e-07, "logits/chosen": -5.957228660583496, "logits/rejected": -6.375521659851074, "logps/chosen": -8008.7978515625, "logps/rejected": -14257.080078125, "loss": -28.6648, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -7799.6240234375, "rewards/margins": 6232.3212890625, "rewards/rejected": -14031.943359375, "step": 1330 }, { "epoch": 0.26300294406280667, "grad_norm": 12185.636440001159, "learning_rate": 4.606494948313758e-07, "logits/chosen": -6.902766227722168, "logits/rejected": -7.007483005523682, "logps/chosen": -10270.865234375, "logps/rejected": -10062.900390625, "loss": -21.7692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -10028.7548828125, "rewards/margins": -221.51220703125, "rewards/rejected": -9807.2412109375, "step": 1340 }, { "epoch": 0.2649656526005888, "grad_norm": 13731.211189405232, "learning_rate": 4.597220474379125e-07, "logits/chosen": -6.251508712768555, "logits/rejected": -6.584108829498291, "logps/chosen": -7746.6455078125, "logps/rejected": -12107.892578125, "loss": -23.2958, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -7419.36962890625, "rewards/margins": 4365.3662109375, "rewards/rejected": -11784.736328125, "step": 1350 }, { "epoch": 0.26692836113837093, "grad_norm": 10659.395704841198, "learning_rate": 4.587847539622942e-07, "logits/chosen": -5.970205783843994, "logits/rejected": -7.021003723144531, "logps/chosen": -6225.58251953125, "logps/rejected": -14322.064453125, "loss": -56.8601, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5847.4248046875, "rewards/margins": 8151.3681640625, "rewards/rejected": -13998.7919921875, "step": 1360 }, { "epoch": 0.2688910696761531, "grad_norm": 15133.877333017912, "learning_rate": 4.5783765840880505e-07, "logits/chosen": -5.978967189788818, "logits/rejected": -6.6390204429626465, "logps/chosen": -5109.75244140625, "logps/rejected": -8415.6962890625, "loss": -36.5043, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -4781.7255859375, "rewards/margins": 3327.212158203125, "rewards/rejected": -8108.9384765625, "step": 1370 }, { "epoch": 0.27085377821393525, "grad_norm": 11078.681062419017, "learning_rate": 4.568808052419196e-07, "logits/chosen": -6.110931873321533, "logits/rejected": -6.497220039367676, "logps/chosen": -4203.34375, "logps/rejected": -6042.4677734375, "loss": -34.1511, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": -3978.739501953125, "rewards/margins": 1846.137939453125, "rewards/rejected": -5824.87744140625, "step": 1380 }, { "epoch": 0.2728164867517174, "grad_norm": 16672.186676961843, "learning_rate": 4.5591423938421513e-07, "logits/chosen": -6.6743950843811035, "logits/rejected": -7.108180999755859, "logps/chosen": -9725.7451171875, "logps/rejected": -9587.6318359375, "loss": -9.8981, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -9441.880859375, "rewards/margins": -112.7349624633789, "rewards/rejected": -9329.1455078125, "step": 1390 }, { "epoch": 0.2747791952894995, "grad_norm": 13322.370523019676, "learning_rate": 4.549380062142627e-07, "logits/chosen": -6.0096049308776855, "logits/rejected": -6.467965602874756, "logps/chosen": -6923.0390625, "logps/rejected": -8913.1962890625, "loss": -37.9187, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -6662.5302734375, "rewards/margins": 1956.9078369140625, "rewards/rejected": -8619.4384765625, "step": 1400 }, { "epoch": 0.27674190382728164, "grad_norm": 12271.731568935813, "learning_rate": 4.5395215156449683e-07, "logits/chosen": -6.329373359680176, "logits/rejected": -6.810887813568115, "logps/chosen": -7398.0087890625, "logps/rejected": -14594.138671875, "loss": -51.2084, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -7095.1318359375, "rewards/margins": 7149.91162109375, "rewards/rejected": -14245.0439453125, "step": 1410 }, { "epoch": 0.2787046123650638, "grad_norm": 10072.985913007707, "learning_rate": 4.5295672171906365e-07, "logits/chosen": -6.342528820037842, "logits/rejected": -7.2635040283203125, "logps/chosen": -7996.9677734375, "logps/rejected": -10459.896484375, "loss": -20.8456, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -7731.5029296875, "rewards/margins": 2500.05517578125, "rewards/rejected": -10231.55859375, "step": 1420 }, { "epoch": 0.2806673209028459, "grad_norm": 22792.211287340782, "learning_rate": 4.5195176341164765e-07, "logits/chosen": -6.525473117828369, "logits/rejected": -7.110024452209473, "logps/chosen": -10503.7509765625, "logps/rejected": -16025.591796875, "loss": -30.5353, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -10235.2626953125, "rewards/margins": 5495.48876953125, "rewards/rejected": -15730.751953125, "step": 1430 }, { "epoch": 0.2826300294406281, "grad_norm": 12539.316588898228, "learning_rate": 4.509373238232782e-07, "logits/chosen": -6.396284103393555, "logits/rejected": -6.716331481933594, "logps/chosen": -13404.931640625, "logps/rejected": -8376.072265625, "loss": -13.1888, "rewards/accuracies": 0.36666667461395264, "rewards/chosen": -13106.119140625, "rewards/margins": -4963.8447265625, "rewards/rejected": -8142.2744140625, "step": 1440 }, { "epoch": 0.2845927379784102, "grad_norm": 6348.043827008181, "learning_rate": 4.499134505801141e-07, "logits/chosen": -6.6794281005859375, "logits/rejected": -7.102025032043457, "logps/chosen": -6077.06982421875, "logps/rejected": -10695.982421875, "loss": -44.291, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -5846.06884765625, "rewards/margins": 4597.7177734375, "rewards/rejected": -10443.787109375, "step": 1450 }, { "epoch": 0.28655544651619236, "grad_norm": 12110.979534488302, "learning_rate": 4.488801917512076e-07, "logits/chosen": -6.6333184242248535, "logits/rejected": -6.957442283630371, "logps/chosen": -9736.861328125, "logps/rejected": -16899.50390625, "loss": -41.1026, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -9447.439453125, "rewards/margins": 7115.53515625, "rewards/rejected": -16562.97265625, "step": 1460 }, { "epoch": 0.2885181550539745, "grad_norm": 14812.101379513399, "learning_rate": 4.478375958462479e-07, "logits/chosen": -6.318536758422852, "logits/rejected": -7.031842231750488, "logps/chosen": -10322.5283203125, "logps/rejected": -12935.365234375, "loss": -46.0894, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -10023.6171875, "rewards/margins": 2686.2734375, "rewards/rejected": -12709.8916015625, "step": 1470 }, { "epoch": 0.2904808635917566, "grad_norm": 16676.396023043104, "learning_rate": 4.467857118132833e-07, "logits/chosen": -6.188724994659424, "logits/rejected": -6.502559661865234, "logps/chosen": -9485.626953125, "logps/rejected": -11656.0439453125, "loss": -29.7157, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": -9238.7138671875, "rewards/margins": 2173.238525390625, "rewards/rejected": -11411.953125, "step": 1480 }, { "epoch": 0.29244357212953875, "grad_norm": 19085.25631293807, "learning_rate": 4.457245890364235e-07, "logits/chosen": -6.672650337219238, "logits/rejected": -7.326931953430176, "logps/chosen": -13780.701171875, "logps/rejected": -13598.59375, "loss": -2.1853, "rewards/accuracies": 0.5, "rewards/chosen": -13431.1357421875, "rewards/margins": -96.58708190917969, "rewards/rejected": -13334.548828125, "step": 1490 }, { "epoch": 0.2944062806673209, "grad_norm": 45650.718005735966, "learning_rate": 4.4465427733352124e-07, "logits/chosen": -6.584973335266113, "logits/rejected": -6.974669456481934, "logps/chosen": -11123.3603515625, "logps/rejected": -16306.6015625, "loss": -41.7994, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -10840.716796875, "rewards/margins": 5193.3505859375, "rewards/rejected": -16034.0673828125, "step": 1500 }, { "epoch": 0.296368989205103, "grad_norm": 13771.953160285857, "learning_rate": 4.43574826953833e-07, "logits/chosen": -5.8255109786987305, "logits/rejected": -6.7153778076171875, "logps/chosen": -7739.87353515625, "logps/rejected": -15156.265625, "loss": -27.5819, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -7428.53271484375, "rewards/margins": 7401.8173828125, "rewards/rejected": -14830.349609375, "step": 1510 }, { "epoch": 0.2983316977428852, "grad_norm": 17997.580041498233, "learning_rate": 4.4248628857565997e-07, "logits/chosen": -5.705665588378906, "logits/rejected": -6.956638336181641, "logps/chosen": -3664.171142578125, "logps/rejected": -10735.1962890625, "loss": -37.9201, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -3327.84521484375, "rewards/margins": 7194.93896484375, "rewards/rejected": -10522.783203125, "step": 1520 }, { "epoch": 0.30029440628066734, "grad_norm": 9925.005255771282, "learning_rate": 4.413887133039692e-07, "logits/chosen": -5.839178085327148, "logits/rejected": -7.049687385559082, "logps/chosen": -8153.58349609375, "logps/rejected": -11363.951171875, "loss": -19.873, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": -7743.95068359375, "rewards/margins": 3345.352294921875, "rewards/rejected": -11089.302734375, "step": 1530 }, { "epoch": 0.30225711481844947, "grad_norm": 24890.57032898622, "learning_rate": 4.4028215266799395e-07, "logits/chosen": -6.093216419219971, "logits/rejected": -7.097301483154297, "logps/chosen": -6767.65869140625, "logps/rejected": -10212.78515625, "loss": -40.5102, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6494.53369140625, "rewards/margins": 3496.70361328125, "rewards/rejected": -9991.2373046875, "step": 1540 }, { "epoch": 0.3042198233562316, "grad_norm": 14981.894505495227, "learning_rate": 4.391666586188145e-07, "logits/chosen": -6.443569183349609, "logits/rejected": -6.796970367431641, "logps/chosen": -8591.0263671875, "logps/rejected": -8831.005859375, "loss": -19.9056, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -8378.328125, "rewards/margins": 226.43017578125, "rewards/rejected": -8604.7587890625, "step": 1550 }, { "epoch": 0.30618253189401373, "grad_norm": 3921.9662901535403, "learning_rate": 4.380422835269193e-07, "logits/chosen": -6.219468116760254, "logits/rejected": -6.6846513748168945, "logps/chosen": -9859.1689453125, "logps/rejected": -14192.310546875, "loss": -28.5812, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -9605.193359375, "rewards/margins": 4320.884765625, "rewards/rejected": -13926.078125, "step": 1560 }, { "epoch": 0.30814524043179586, "grad_norm": 18477.71999076715, "learning_rate": 4.3690908017974596e-07, "logits/chosen": -6.090170383453369, "logits/rejected": -6.5510149002075195, "logps/chosen": -5316.6904296875, "logps/rejected": -11094.0419921875, "loss": -16.2614, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5090.953125, "rewards/margins": 5729.7958984375, "rewards/rejected": -10820.748046875, "step": 1570 }, { "epoch": 0.310107948969578, "grad_norm": 44440.960196552594, "learning_rate": 4.3576710177920356e-07, "logits/chosen": -6.613284111022949, "logits/rejected": -6.795392036437988, "logps/chosen": -10145.8076171875, "logps/rejected": -9258.9296875, "loss": -21.2526, "rewards/accuracies": 0.29999998211860657, "rewards/chosen": -9909.3984375, "rewards/margins": -881.1222534179688, "rewards/rejected": -9028.27734375, "step": 1580 }, { "epoch": 0.3120706575073602, "grad_norm": 16684.28830140781, "learning_rate": 4.346164019391742e-07, "logits/chosen": -6.741615295410156, "logits/rejected": -7.19757080078125, "logps/chosen": -14124.763671875, "logps/rejected": -15103.205078125, "loss": -38.9755, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -13763.431640625, "rewards/margins": 1006.6990356445312, "rewards/rejected": -14770.1298828125, "step": 1590 }, { "epoch": 0.3140333660451423, "grad_norm": 14967.963879925434, "learning_rate": 4.3345703468299634e-07, "logits/chosen": -6.714296817779541, "logits/rejected": -6.952079772949219, "logps/chosen": -9553.373046875, "logps/rejected": -14918.1953125, "loss": -18.2398, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": -9275.6455078125, "rewards/margins": 5371.67431640625, "rewards/rejected": -14647.3203125, "step": 1600 }, { "epoch": 0.31599607458292445, "grad_norm": 18034.940069197684, "learning_rate": 4.322890544409286e-07, "logits/chosen": -5.766831874847412, "logits/rejected": -7.157978057861328, "logps/chosen": -4744.494140625, "logps/rejected": -8146.5439453125, "loss": -43.0487, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -4430.04541015625, "rewards/margins": 3457.79345703125, "rewards/rejected": -7887.83837890625, "step": 1610 }, { "epoch": 0.3179587831207066, "grad_norm": 28913.987933550867, "learning_rate": 4.311125160475938e-07, "logits/chosen": -6.155313014984131, "logits/rejected": -6.643632411956787, "logps/chosen": -5253.39404296875, "logps/rejected": -9396.46484375, "loss": -29.2824, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -4981.29345703125, "rewards/margins": 4062.660888671875, "rewards/rejected": -9043.955078125, "step": 1620 }, { "epoch": 0.3199214916584887, "grad_norm": 13126.370013071713, "learning_rate": 4.299274747394055e-07, "logits/chosen": -6.353914260864258, "logits/rejected": -6.803383827209473, "logps/chosen": -6507.6142578125, "logps/rejected": -10709.69921875, "loss": -58.5521, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -6241.24951171875, "rewards/margins": 4226.03125, "rewards/rejected": -10467.28125, "step": 1630 }, { "epoch": 0.32188420019627084, "grad_norm": 40439.42790460046, "learning_rate": 4.287339861519737e-07, "logits/chosen": -6.636476039886475, "logits/rejected": -7.388413429260254, "logps/chosen": -9108.7060546875, "logps/rejected": -15897.78125, "loss": -47.8094, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -8809.01171875, "rewards/margins": 6805.90087890625, "rewards/rejected": -15614.912109375, "step": 1640 }, { "epoch": 0.323846908734053, "grad_norm": 26307.3568750265, "learning_rate": 4.275321063174936e-07, "logits/chosen": -6.471556663513184, "logits/rejected": -7.147758483886719, "logps/chosen": -10961.802734375, "logps/rejected": -16991.923828125, "loss": -42.6004, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -10625.1533203125, "rewards/margins": 6111.3076171875, "rewards/rejected": -16736.462890625, "step": 1650 }, { "epoch": 0.3258096172718351, "grad_norm": 15045.747017582844, "learning_rate": 4.2632189166211454e-07, "logits/chosen": -6.399059295654297, "logits/rejected": -6.862725734710693, "logps/chosen": -7297.2861328125, "logps/rejected": -13507.59375, "loss": -45.0948, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -7059.81396484375, "rewards/margins": 6183.8037109375, "rewards/rejected": -13243.619140625, "step": 1660 }, { "epoch": 0.3277723258096173, "grad_norm": 16943.7904322964, "learning_rate": 4.251033990032912e-07, "logits/chosen": -6.304139137268066, "logits/rejected": -6.857525825500488, "logps/chosen": -6581.91162109375, "logps/rejected": -10831.91015625, "loss": -30.3256, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -6284.81494140625, "rewards/margins": 4242.73828125, "rewards/rejected": -10527.5517578125, "step": 1670 }, { "epoch": 0.3297350343473994, "grad_norm": 9572.447399548526, "learning_rate": 4.238766855471161e-07, "logits/chosen": -6.504629611968994, "logits/rejected": -6.5430908203125, "logps/chosen": -12809.6513671875, "logps/rejected": -8760.404296875, "loss": -10.2935, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -12462.7685546875, "rewards/margins": -3937.419921875, "rewards/rejected": -8525.3486328125, "step": 1680 }, { "epoch": 0.33169774288518156, "grad_norm": 32719.102877723126, "learning_rate": 4.226418088856335e-07, "logits/chosen": -5.931424617767334, "logits/rejected": -6.456337928771973, "logps/chosen": -4077.9609375, "logps/rejected": -15109.28125, "loss": -52.9129, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -3816.55712890625, "rewards/margins": 10960.751953125, "rewards/rejected": -14777.3095703125, "step": 1690 }, { "epoch": 0.3336604514229637, "grad_norm": 5435.228867675114, "learning_rate": 4.2139882699413613e-07, "logits/chosen": -6.200502872467041, "logits/rejected": -7.18752384185791, "logps/chosen": -5487.70849609375, "logps/rejected": -8558.9560546875, "loss": -15.2331, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -5215.70166015625, "rewards/margins": 3152.622314453125, "rewards/rejected": -8368.32421875, "step": 1700 }, { "epoch": 0.3356231599607458, "grad_norm": 6496.65779297139, "learning_rate": 4.2014779822844274e-07, "logits/chosen": -5.973532676696777, "logits/rejected": -6.415289878845215, "logps/chosen": -5551.83447265625, "logps/rejected": -9404.064453125, "loss": -45.1167, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -5321.79833984375, "rewards/margins": 3826.15966796875, "rewards/rejected": -9147.9580078125, "step": 1710 }, { "epoch": 0.33758586849852795, "grad_norm": 25883.350225803675, "learning_rate": 4.18888781322159e-07, "logits/chosen": -6.446893215179443, "logits/rejected": -7.29486083984375, "logps/chosen": -8645.34765625, "logps/rejected": -11780.076171875, "loss": -32.433, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": -8398.310546875, "rewards/margins": 3124.594482421875, "rewards/rejected": -11522.90625, "step": 1720 }, { "epoch": 0.3395485770363101, "grad_norm": 9542.023490864478, "learning_rate": 4.176218353839195e-07, "logits/chosen": -6.123809814453125, "logits/rejected": -6.919741630554199, "logps/chosen": -6543.99853515625, "logps/rejected": -13851.470703125, "loss": -35.4788, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -6281.67578125, "rewards/margins": 7359.73291015625, "rewards/rejected": -13641.408203125, "step": 1730 }, { "epoch": 0.34151128557409227, "grad_norm": 10729.13678928476, "learning_rate": 4.1634701989461325e-07, "logits/chosen": -6.191473960876465, "logits/rejected": -6.820995330810547, "logps/chosen": -7325.34228515625, "logps/rejected": -13842.2451171875, "loss": -37.7224, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -7056.73193359375, "rewards/margins": 6507.35888671875, "rewards/rejected": -13564.091796875, "step": 1740 }, { "epoch": 0.3434739941118744, "grad_norm": 10946.911117928952, "learning_rate": 4.1506439470459056e-07, "logits/chosen": -6.822333335876465, "logits/rejected": -7.12045955657959, "logps/chosen": -10873.5673828125, "logps/rejected": -13082.49609375, "loss": -8.1428, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -10639.3330078125, "rewards/margins": 2221.7890625, "rewards/rejected": -12861.1220703125, "step": 1750 }, { "epoch": 0.34543670264965654, "grad_norm": 10711.179014733785, "learning_rate": 4.137740200308537e-07, "logits/chosen": -6.126457691192627, "logits/rejected": -6.546261787414551, "logps/chosen": -8865.1162109375, "logps/rejected": -9412.0830078125, "loss": -22.4878, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -8574.724609375, "rewards/margins": 570.0736083984375, "rewards/rejected": -9144.798828125, "step": 1760 }, { "epoch": 0.34739941118743867, "grad_norm": 14342.817948745998, "learning_rate": 4.124759564542295e-07, "logits/chosen": -6.167205810546875, "logits/rejected": -6.387153625488281, "logps/chosen": -8536.2138671875, "logps/rejected": -6905.76318359375, "loss": -23.1284, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -8240.0732421875, "rewards/margins": -1547.017822265625, "rewards/rejected": -6693.05615234375, "step": 1770 }, { "epoch": 0.3493621197252208, "grad_norm": 10155.66425556719, "learning_rate": 4.111702649165255e-07, "logits/chosen": -6.405218601226807, "logits/rejected": -7.158683776855469, "logps/chosen": -9040.25390625, "logps/rejected": -13965.7685546875, "loss": -37.9903, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -8788.6064453125, "rewards/margins": 4961.7509765625, "rewards/rejected": -13750.3564453125, "step": 1780 }, { "epoch": 0.35132482826300293, "grad_norm": 31230.39166523614, "learning_rate": 4.0985700671766834e-07, "logits/chosen": -6.116522312164307, "logits/rejected": -7.334207057952881, "logps/chosen": -8519.853515625, "logps/rejected": -15307.6875, "loss": -53.84, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -8164.7919921875, "rewards/margins": 6861.2421875, "rewards/rejected": -15026.0341796875, "step": 1790 }, { "epoch": 0.35328753680078506, "grad_norm": 23400.833915922653, "learning_rate": 4.085362435128262e-07, "logits/chosen": -6.229931831359863, "logits/rejected": -7.1061272621154785, "logps/chosen": -7389.0341796875, "logps/rejected": -10866.8408203125, "loss": -29.8333, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -7101.7373046875, "rewards/margins": 3514.470703125, "rewards/rejected": -10616.208984375, "step": 1800 }, { "epoch": 0.35525024533856725, "grad_norm": 7693.018426878932, "learning_rate": 4.0720803730951423e-07, "logits/chosen": -6.362903118133545, "logits/rejected": -7.306522369384766, "logps/chosen": -6278.55908203125, "logps/rejected": -7908.0830078125, "loss": -45.579, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -5968.39599609375, "rewards/margins": 1746.5986328125, "rewards/rejected": -7714.99462890625, "step": 1810 }, { "epoch": 0.3572129538763494, "grad_norm": 8475.921331108584, "learning_rate": 4.058724504646834e-07, "logits/chosen": -6.422257900238037, "logits/rejected": -7.1949052810668945, "logps/chosen": -5997.73388671875, "logps/rejected": -10381.0712890625, "loss": -53.097, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -5768.95556640625, "rewards/margins": 4384.849609375, "rewards/rejected": -10153.8056640625, "step": 1820 }, { "epoch": 0.3591756624141315, "grad_norm": 16679.6149586889, "learning_rate": 4.045295456817924e-07, "logits/chosen": -6.577358245849609, "logits/rejected": -6.745004177093506, "logps/chosen": -8837.1201171875, "logps/rejected": -8283.861328125, "loss": -39.1987, "rewards/accuracies": 0.533333420753479, "rewards/chosen": -8560.240234375, "rewards/margins": -554.2174682617188, "rewards/rejected": -8006.0244140625, "step": 1830 }, { "epoch": 0.36113837095191365, "grad_norm": 16799.665741154706, "learning_rate": 4.0317938600786484e-07, "logits/chosen": -6.467000484466553, "logits/rejected": -6.836897373199463, "logps/chosen": -8037.5732421875, "logps/rejected": -11516.619140625, "loss": -44.2568, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -7742.60400390625, "rewards/margins": 3498.44775390625, "rewards/rejected": -11241.0498046875, "step": 1840 }, { "epoch": 0.3631010794896958, "grad_norm": 14488.801305882504, "learning_rate": 4.0182203483052825e-07, "logits/chosen": -6.976507663726807, "logits/rejected": -7.19774866104126, "logps/chosen": -11419.611328125, "logps/rejected": -11130.490234375, "loss": -24.6815, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -11138.6572265625, "rewards/margins": -213.94052124023438, "rewards/rejected": -10924.716796875, "step": 1850 }, { "epoch": 0.3650637880274779, "grad_norm": 21736.683658713566, "learning_rate": 4.004575558750389e-07, "logits/chosen": -6.617936134338379, "logits/rejected": -7.219670295715332, "logps/chosen": -17619.69921875, "logps/rejected": -15508.1796875, "loss": -15.3986, "rewards/accuracies": 0.4999999403953552, "rewards/chosen": -17269.08984375, "rewards/margins": -2062.36572265625, "rewards/rejected": -15206.724609375, "step": 1860 }, { "epoch": 0.36702649656526004, "grad_norm": 10839.42573023297, "learning_rate": 3.9908601320128976e-07, "logits/chosen": -6.775691032409668, "logits/rejected": -7.080155372619629, "logps/chosen": -7711.4619140625, "logps/rejected": -11161.662109375, "loss": -7.4533, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -7474.02197265625, "rewards/margins": 3445.780517578125, "rewards/rejected": -10919.802734375, "step": 1870 }, { "epoch": 0.3689892051030422, "grad_norm": 29009.623316468194, "learning_rate": 3.9770747120080284e-07, "logits/chosen": -6.828773498535156, "logits/rejected": -7.367563724517822, "logps/chosen": -8510.3857421875, "logps/rejected": -15051.3955078125, "loss": -26.4521, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -8284.8466796875, "rewards/margins": 6557.13134765625, "rewards/rejected": -14841.978515625, "step": 1880 }, { "epoch": 0.37095191364082436, "grad_norm": 8125.767659235967, "learning_rate": 3.963219945937063e-07, "logits/chosen": -7.164325714111328, "logits/rejected": -7.940123558044434, "logps/chosen": -8702.55859375, "logps/rejected": -13414.462890625, "loss": -27.3479, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -8469.38671875, "rewards/margins": 4719.43212890625, "rewards/rejected": -13188.8173828125, "step": 1890 }, { "epoch": 0.3729146221786065, "grad_norm": 20128.852279891194, "learning_rate": 3.949296484256959e-07, "logits/chosen": -6.005627155303955, "logits/rejected": -6.6730499267578125, "logps/chosen": -6029.48876953125, "logps/rejected": -8873.365234375, "loss": -46.7657, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -5774.28369140625, "rewards/margins": 2855.47412109375, "rewards/rejected": -8629.7578125, "step": 1900 }, { "epoch": 0.3748773307163886, "grad_norm": 11611.592042939057, "learning_rate": 3.935304980649813e-07, "logits/chosen": -6.143228054046631, "logits/rejected": -6.767208099365234, "logps/chosen": -6448.12890625, "logps/rejected": -9610.1513671875, "loss": -34.7777, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": -6148.0498046875, "rewards/margins": 3182.82080078125, "rewards/rejected": -9330.87109375, "step": 1910 }, { "epoch": 0.37684003925417076, "grad_norm": 11255.948590525091, "learning_rate": 3.92124609199217e-07, "logits/chosen": -6.371559143066406, "logits/rejected": -6.618648529052734, "logps/chosen": -6778.89697265625, "logps/rejected": -9971.0458984375, "loss": -30.0421, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -6574.62255859375, "rewards/margins": 3179.16552734375, "rewards/rejected": -9753.7890625, "step": 1920 }, { "epoch": 0.3788027477919529, "grad_norm": 11540.695137309156, "learning_rate": 3.907120478324185e-07, "logits/chosen": -6.016282558441162, "logits/rejected": -6.250086784362793, "logps/chosen": -5945.47265625, "logps/rejected": -6732.81884765625, "loss": -24.9027, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -5655.62353515625, "rewards/margins": 806.2496948242188, "rewards/rejected": -6461.87353515625, "step": 1930 }, { "epoch": 0.380765456329735, "grad_norm": 14068.590589225652, "learning_rate": 3.8929288028186364e-07, "logits/chosen": -6.633327484130859, "logits/rejected": -7.0121750831604, "logps/chosen": -7598.9638671875, "logps/rejected": -7784.6025390625, "loss": -45.5425, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -7379.9755859375, "rewards/margins": 207.6694793701172, "rewards/rejected": -7587.6455078125, "step": 1940 }, { "epoch": 0.38272816486751715, "grad_norm": 21831.122810701374, "learning_rate": 3.8786717317497875e-07, "logits/chosen": -6.206249713897705, "logits/rejected": -7.164989471435547, "logps/chosen": -5459.9853515625, "logps/rejected": -10850.150390625, "loss": -44.4673, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -5146.31982421875, "rewards/margins": 5433.4560546875, "rewards/rejected": -10579.7763671875, "step": 1950 }, { "epoch": 0.38469087340529934, "grad_norm": 9843.220474080248, "learning_rate": 3.864349934462111e-07, "logits/chosen": -5.770997047424316, "logits/rejected": -6.517005920410156, "logps/chosen": -5309.1689453125, "logps/rejected": -11917.517578125, "loss": -44.0759, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -5046.724609375, "rewards/margins": 6625.8583984375, "rewards/rejected": -11672.583984375, "step": 1960 }, { "epoch": 0.38665358194308147, "grad_norm": 16026.068430521844, "learning_rate": 3.84996408333886e-07, "logits/chosen": -5.9577789306640625, "logits/rejected": -6.7434258460998535, "logps/chosen": -8018.2353515625, "logps/rejected": -9249.0986328125, "loss": -47.4873, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -7712.00146484375, "rewards/margins": 1315.7193603515625, "rewards/rejected": -9027.7216796875, "step": 1970 }, { "epoch": 0.3886162904808636, "grad_norm": 23107.49034014634, "learning_rate": 3.8355148537705047e-07, "logits/chosen": -5.355231285095215, "logits/rejected": -6.175726890563965, "logps/chosen": -2374.9765625, "logps/rejected": -7961.8720703125, "loss": -52.2216, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -2144.327392578125, "rewards/margins": 5612.90869140625, "rewards/rejected": -7757.23583984375, "step": 1980 }, { "epoch": 0.39057899901864573, "grad_norm": 17054.10858188501, "learning_rate": 3.8210029241230204e-07, "logits/chosen": -5.818088054656982, "logits/rejected": -6.694204807281494, "logps/chosen": -7754.7197265625, "logps/rejected": -11669.5830078125, "loss": -40.9575, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -7406.78271484375, "rewards/margins": 3979.03759765625, "rewards/rejected": -11385.8203125, "step": 1990 }, { "epoch": 0.39254170755642787, "grad_norm": 10408.298207486116, "learning_rate": 3.806428975706042e-07, "logits/chosen": -6.203009605407715, "logits/rejected": -6.5996198654174805, "logps/chosen": -8516.5625, "logps/rejected": -10669.55078125, "loss": -37.0203, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8282.1123046875, "rewards/margins": 2157.823974609375, "rewards/rejected": -10439.935546875, "step": 2000 }, { "epoch": 0.39450441609421, "grad_norm": 36001.742456302505, "learning_rate": 3.791793692740876e-07, "logits/chosen": -6.207632541656494, "logits/rejected": -6.847393989562988, "logps/chosen": -6837.43115234375, "logps/rejected": -8864.228515625, "loss": -27.0665, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -6604.97119140625, "rewards/margins": 2090.73583984375, "rewards/rejected": -8695.70703125, "step": 2010 }, { "epoch": 0.39646712463199213, "grad_norm": 25425.679475889854, "learning_rate": 3.777097762328381e-07, "logits/chosen": -6.223984241485596, "logits/rejected": -6.718271732330322, "logps/chosen": -6741.27490234375, "logps/rejected": -10395.0341796875, "loss": -47.8121, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6444.88623046875, "rewards/margins": 3689.50537109375, "rewards/rejected": -10134.3916015625, "step": 2020 }, { "epoch": 0.39842983316977426, "grad_norm": 10090.93407714648, "learning_rate": 3.762341874416702e-07, "logits/chosen": -6.204736232757568, "logits/rejected": -6.861325263977051, "logps/chosen": -7979.85546875, "logps/rejected": -8932.216796875, "loss": -17.6387, "rewards/accuracies": 0.5666666030883789, "rewards/chosen": -7752.3955078125, "rewards/margins": 1016.1131591796875, "rewards/rejected": -8768.5078125, "step": 2030 }, { "epoch": 0.40039254170755645, "grad_norm": 12057.72704145848, "learning_rate": 3.7475267217688896e-07, "logits/chosen": -6.268139839172363, "logits/rejected": -6.780392646789551, "logps/chosen": -5533.49169921875, "logps/rejected": -14908.111328125, "loss": -46.0538, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -5333.0732421875, "rewards/margins": 9322.2080078125, "rewards/rejected": -14655.279296875, "step": 2040 }, { "epoch": 0.4023552502453386, "grad_norm": 14106.665384198648, "learning_rate": 3.7326529999303633e-07, "logits/chosen": -6.070862770080566, "logits/rejected": -6.0350165367126465, "logps/chosen": -4712.54052734375, "logps/rejected": -5821.7255859375, "loss": -35.5696, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -4496.8466796875, "rewards/margins": 1067.4761962890625, "rewards/rejected": -5564.3232421875, "step": 2050 }, { "epoch": 0.4043179587831207, "grad_norm": 10616.700386516994, "learning_rate": 3.7177214071962684e-07, "logits/chosen": -6.62981653213501, "logits/rejected": -6.411627292633057, "logps/chosen": -7432.15771484375, "logps/rejected": -9206.5830078125, "loss": -11.893, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7179.9091796875, "rewards/margins": 1711.65625, "rewards/rejected": -8891.5654296875, "step": 2060 }, { "epoch": 0.40628066732090284, "grad_norm": 14118.458228728883, "learning_rate": 3.7027326445786835e-07, "logits/chosen": -6.226855278015137, "logits/rejected": -7.005548000335693, "logps/chosen": -6520.7607421875, "logps/rejected": -12076.251953125, "loss": -34.6365, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -6264.1015625, "rewards/margins": 5574.2880859375, "rewards/rejected": -11838.3896484375, "step": 2070 }, { "epoch": 0.408243375858685, "grad_norm": 15564.572580294667, "learning_rate": 3.6876874157737167e-07, "logits/chosen": -6.970287322998047, "logits/rejected": -7.177987575531006, "logps/chosen": -12914.744140625, "logps/rejected": -15925.8984375, "loss": -44.4124, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -12653.640625, "rewards/margins": 2987.95556640625, "rewards/rejected": -15641.59375, "step": 2080 }, { "epoch": 0.4102060843964671, "grad_norm": 34576.04513534019, "learning_rate": 3.67258642712846e-07, "logits/chosen": -6.322152137756348, "logits/rejected": -7.123676300048828, "logps/chosen": -9783.587890625, "logps/rejected": -13753.447265625, "loss": -41.4083, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -9530.115234375, "rewards/margins": 4025.38623046875, "rewards/rejected": -13555.501953125, "step": 2090 }, { "epoch": 0.41216879293424924, "grad_norm": 12976.310220359248, "learning_rate": 3.6574303876078366e-07, "logits/chosen": -6.044041633605957, "logits/rejected": -6.6243157386779785, "logps/chosen": -7803.0830078125, "logps/rejected": -11632.576171875, "loss": -38.7968, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -7520.2646484375, "rewards/margins": 3838.70166015625, "rewards/rejected": -11358.9658203125, "step": 2100 }, { "epoch": 0.4141315014720314, "grad_norm": 5314.678922391978, "learning_rate": 3.642220008761309e-07, "logits/chosen": -6.025908470153809, "logits/rejected": -6.694465637207031, "logps/chosen": -11593.830078125, "logps/rejected": -13148.03125, "loss": -52.519, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": -11266.267578125, "rewards/margins": 1594.821533203125, "rewards/rejected": -12861.087890625, "step": 2110 }, { "epoch": 0.41609421000981356, "grad_norm": 5214.576216188377, "learning_rate": 3.626956004689476e-07, "logits/chosen": -5.666697025299072, "logits/rejected": -7.252646446228027, "logps/chosen": -5951.28125, "logps/rejected": -14502.0576171875, "loss": -45.5351, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5589.72265625, "rewards/margins": 8697.138671875, "rewards/rejected": -14286.8623046875, "step": 2120 }, { "epoch": 0.4180569185475957, "grad_norm": 22734.81216038372, "learning_rate": 3.6116390920105474e-07, "logits/chosen": -6.247704029083252, "logits/rejected": -6.781765937805176, "logps/chosen": -9925.275390625, "logps/rejected": -13164.9248046875, "loss": -25.2973, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -9657.9462890625, "rewards/margins": 3271.98779296875, "rewards/rejected": -12929.935546875, "step": 2130 }, { "epoch": 0.4200196270853778, "grad_norm": 13230.357969123414, "learning_rate": 3.5962699898266983e-07, "logits/chosen": -6.101110935211182, "logits/rejected": -6.679749488830566, "logps/chosen": -7167.546875, "logps/rejected": -10159.4560546875, "loss": -13.6468, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -6923.52880859375, "rewards/margins": 3019.69580078125, "rewards/rejected": -9943.2255859375, "step": 2140 }, { "epoch": 0.42198233562315995, "grad_norm": 25279.94535800821, "learning_rate": 3.5808494196903117e-07, "logits/chosen": -5.892174243927002, "logits/rejected": -6.804270267486572, "logps/chosen": -5454.3828125, "logps/rejected": -12194.4091796875, "loss": -49.6637, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -5128.13525390625, "rewards/margins": 6851.7109375, "rewards/rejected": -11979.84765625, "step": 2150 }, { "epoch": 0.4239450441609421, "grad_norm": 17540.865988563488, "learning_rate": 3.565378105570097e-07, "logits/chosen": -5.941842079162598, "logits/rejected": -6.358455181121826, "logps/chosen": -7152.48291015625, "logps/rejected": -7712.0234375, "loss": -23.4808, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -6868.4072265625, "rewards/margins": 636.2252807617188, "rewards/rejected": -7504.6318359375, "step": 2160 }, { "epoch": 0.4259077526987242, "grad_norm": 11878.472329071703, "learning_rate": 3.549856773817107e-07, "logits/chosen": -6.390814781188965, "logits/rejected": -7.072608947753906, "logps/chosen": -6028.4560546875, "logps/rejected": -10688.0625, "loss": -59.6464, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -5789.8974609375, "rewards/margins": 4682.11572265625, "rewards/rejected": -10472.0126953125, "step": 2170 }, { "epoch": 0.4278704612365064, "grad_norm": 11243.665894252028, "learning_rate": 3.5342861531306344e-07, "logits/chosen": -5.897531509399414, "logits/rejected": -6.965134620666504, "logps/chosen": -7044.66552734375, "logps/rejected": -13736.810546875, "loss": -44.7323, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6790.47265625, "rewards/margins": 6730.32568359375, "rewards/rejected": -13520.798828125, "step": 2180 }, { "epoch": 0.42983316977428854, "grad_norm": 23170.290901201486, "learning_rate": 3.518666974524002e-07, "logits/chosen": -6.07274866104126, "logits/rejected": -6.793564796447754, "logps/chosen": -7941.83740234375, "logps/rejected": -14779.189453125, "loss": -50.6473, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -7625.8037109375, "rewards/margins": 6880.0322265625, "rewards/rejected": -14505.837890625, "step": 2190 }, { "epoch": 0.43179587831207067, "grad_norm": 54659.36540561561, "learning_rate": 3.5029999712902387e-07, "logits/chosen": -5.7945146560668945, "logits/rejected": -6.588175296783447, "logps/chosen": -7551.9560546875, "logps/rejected": -14917.3251953125, "loss": -52.4227, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -7222.6513671875, "rewards/margins": 7340.0517578125, "rewards/rejected": -14562.701171875, "step": 2200 }, { "epoch": 0.4337585868498528, "grad_norm": 17117.667644673973, "learning_rate": 3.4872858789676583e-07, "logits/chosen": -6.135748863220215, "logits/rejected": -6.370306491851807, "logps/chosen": -8900.66796875, "logps/rejected": -13838.0830078125, "loss": -47.3348, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -8667.740234375, "rewards/margins": 4935.6162109375, "rewards/rejected": -13603.3564453125, "step": 2210 }, { "epoch": 0.43572129538763493, "grad_norm": 23128.657620138194, "learning_rate": 3.4715254353053236e-07, "logits/chosen": -6.314383506774902, "logits/rejected": -6.691563606262207, "logps/chosen": -10103.6259765625, "logps/rejected": -14022.591796875, "loss": -55.6205, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -9831.693359375, "rewards/margins": 3909.940673828125, "rewards/rejected": -13741.6328125, "step": 2220 }, { "epoch": 0.43768400392541706, "grad_norm": 24843.72883460827, "learning_rate": 3.4557193802284123e-07, "logits/chosen": -6.022536754608154, "logits/rejected": -6.278723239898682, "logps/chosen": -8685.5361328125, "logps/rejected": -11235.9013671875, "loss": -24.9526, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -8420.8056640625, "rewards/margins": 2556.953857421875, "rewards/rejected": -10977.759765625, "step": 2230 }, { "epoch": 0.4396467124631992, "grad_norm": 12811.070941534039, "learning_rate": 3.4398684558034763e-07, "logits/chosen": -6.142720699310303, "logits/rejected": -6.256009101867676, "logps/chosen": -3110.73681640625, "logps/rejected": -8999.16015625, "loss": -56.048, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -2842.2607421875, "rewards/margins": 5906.97119140625, "rewards/rejected": -8749.232421875, "step": 2240 }, { "epoch": 0.44160942100098133, "grad_norm": 11827.835194286312, "learning_rate": 3.4239734062036067e-07, "logits/chosen": -5.979983806610107, "logits/rejected": -6.681007385253906, "logps/chosen": -3733.819580078125, "logps/rejected": -12031.208984375, "loss": -59.0953, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -3449.771484375, "rewards/margins": 8301.556640625, "rewards/rejected": -11751.328125, "step": 2250 }, { "epoch": 0.4435721295387635, "grad_norm": 21911.259574066502, "learning_rate": 3.4080349776734924e-07, "logits/chosen": -6.0595550537109375, "logits/rejected": -6.677829742431641, "logps/chosen": -4817.1435546875, "logps/rejected": -10647.822265625, "loss": -45.1485, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4518.4814453125, "rewards/margins": 5851.7021484375, "rewards/rejected": -10370.18359375, "step": 2260 }, { "epoch": 0.44553483807654565, "grad_norm": 17760.45761412518, "learning_rate": 3.392053918494389e-07, "logits/chosen": -5.527416229248047, "logits/rejected": -6.26800537109375, "logps/chosen": -2170.3720703125, "logps/rejected": -10847.1337890625, "loss": -55.2189, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": -1864.0986328125, "rewards/margins": 8712.890625, "rewards/rejected": -10576.990234375, "step": 2270 }, { "epoch": 0.4474975466143278, "grad_norm": 37400.261543322566, "learning_rate": 3.376030978948983e-07, "logits/chosen": -5.683913230895996, "logits/rejected": -6.262761116027832, "logps/chosen": -9661.52734375, "logps/rejected": -10261.224609375, "loss": -39.25, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": -9339.1826171875, "rewards/margins": 634.7274169921875, "rewards/rejected": -9973.9111328125, "step": 2280 }, { "epoch": 0.4494602551521099, "grad_norm": 31974.31095075975, "learning_rate": 3.3599669112861756e-07, "logits/chosen": -6.26138973236084, "logits/rejected": -6.562506198883057, "logps/chosen": -7902.3544921875, "logps/rejected": -13649.4580078125, "loss": -41.8166, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -7633.0771484375, "rewards/margins": 5724.45166015625, "rewards/rejected": -13357.529296875, "step": 2290 }, { "epoch": 0.45142296368989204, "grad_norm": 40839.04383199504, "learning_rate": 3.343862469685755e-07, "logits/chosen": -6.151627540588379, "logits/rejected": -6.493348598480225, "logps/chosen": -4104.02880859375, "logps/rejected": -7776.77880859375, "loss": -30.3878, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -3863.81201171875, "rewards/margins": 3685.8046875, "rewards/rejected": -7549.6181640625, "step": 2300 }, { "epoch": 0.4533856722276742, "grad_norm": 18805.91077532629, "learning_rate": 3.3277184102230004e-07, "logits/chosen": -5.606184959411621, "logits/rejected": -5.81091833114624, "logps/chosen": -3033.305419921875, "logps/rejected": -9306.1220703125, "loss": -31.4932, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2763.66845703125, "rewards/margins": 6268.06396484375, "rewards/rejected": -9031.7314453125, "step": 2310 }, { "epoch": 0.4553483807654563, "grad_norm": 27601.497857089784, "learning_rate": 3.311535490833176e-07, "logits/chosen": -6.179599285125732, "logits/rejected": -6.64168643951416, "logps/chosen": -6308.857421875, "logps/rejected": -10572.8466796875, "loss": -34.8395, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -6059.90576171875, "rewards/margins": 4225.98779296875, "rewards/rejected": -10285.89453125, "step": 2320 }, { "epoch": 0.4573110893032385, "grad_norm": 52777.57980193355, "learning_rate": 3.2953144712759537e-07, "logits/chosen": -5.664132595062256, "logits/rejected": -6.792402744293213, "logps/chosen": -5799.3837890625, "logps/rejected": -10318.4609375, "loss": -44.46, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5456.71630859375, "rewards/margins": 4612.26220703125, "rewards/rejected": -10068.978515625, "step": 2330 }, { "epoch": 0.4592737978410206, "grad_norm": 9306.8894771613, "learning_rate": 3.279056113099742e-07, "logits/chosen": -5.553504467010498, "logits/rejected": -6.759832859039307, "logps/chosen": -3775.756591796875, "logps/rejected": -11996.390625, "loss": -47.4639, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3488.231201171875, "rewards/margins": 8178.7978515625, "rewards/rejected": -11667.029296875, "step": 2340 }, { "epoch": 0.46123650637880276, "grad_norm": 16590.12817904555, "learning_rate": 3.2627611796059283e-07, "logits/chosen": -5.9032697677612305, "logits/rejected": -6.455768585205078, "logps/chosen": -5309.560546875, "logps/rejected": -8784.9228515625, "loss": -34.568, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -5036.5908203125, "rewards/margins": 3509.909423828125, "rewards/rejected": -8546.5, "step": 2350 }, { "epoch": 0.4631992149165849, "grad_norm": 13221.726151444313, "learning_rate": 3.246430435813051e-07, "logits/chosen": -6.2327446937561035, "logits/rejected": -6.560967922210693, "logps/chosen": -7888.08056640625, "logps/rejected": -8374.0146484375, "loss": -34.9453, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": -7614.4130859375, "rewards/margins": 541.7490234375, "rewards/rejected": -8156.16259765625, "step": 2360 }, { "epoch": 0.465161923454367, "grad_norm": 21991.579564439944, "learning_rate": 3.230064648420878e-07, "logits/chosen": -6.088255405426025, "logits/rejected": -6.7369184494018555, "logps/chosen": -8426.826171875, "logps/rejected": -7148.10546875, "loss": -11.2355, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -8150.1279296875, "rewards/margins": -1194.8394775390625, "rewards/rejected": -6955.2890625, "step": 2370 }, { "epoch": 0.46712463199214915, "grad_norm": 16444.38654680433, "learning_rate": 3.2136645857744114e-07, "logits/chosen": -6.368856906890869, "logits/rejected": -6.475495338439941, "logps/chosen": -7366.2958984375, "logps/rejected": -11657.75390625, "loss": -45.3706, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -7133.77734375, "rewards/margins": 4246.9892578125, "rewards/rejected": -11380.767578125, "step": 2380 }, { "epoch": 0.4690873405299313, "grad_norm": 10175.689773101021, "learning_rate": 3.197231017827818e-07, "logits/chosen": -5.991966247558594, "logits/rejected": -6.515161991119385, "logps/chosen": -5335.4404296875, "logps/rejected": -10783.853515625, "loss": -40.7891, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5053.935546875, "rewards/margins": 5468.75927734375, "rewards/rejected": -10522.6953125, "step": 2390 }, { "epoch": 0.47105004906771347, "grad_norm": 54264.794299396104, "learning_rate": 3.1807647161082797e-07, "logits/chosen": -6.792760372161865, "logits/rejected": -6.9702301025390625, "logps/chosen": -13532.9423828125, "logps/rejected": -11839.1123046875, "loss": -38.1973, "rewards/accuracies": 0.5, "rewards/chosen": -13258.138671875, "rewards/margins": -1676.642822265625, "rewards/rejected": -11581.4951171875, "step": 2400 }, { "epoch": 0.4730127576054956, "grad_norm": 16367.658019817267, "learning_rate": 3.1642664536797693e-07, "logits/chosen": -6.431081295013428, "logits/rejected": -6.521323204040527, "logps/chosen": -10428.35546875, "logps/rejected": -10706.857421875, "loss": -24.2161, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -10155.3583984375, "rewards/margins": 277.151123046875, "rewards/rejected": -10432.509765625, "step": 2410 }, { "epoch": 0.47497546614327774, "grad_norm": 31204.714310351377, "learning_rate": 3.147737005106762e-07, "logits/chosen": -6.890205383300781, "logits/rejected": -7.143877983093262, "logps/chosen": -11078.3154296875, "logps/rejected": -20215.22265625, "loss": -52.6612, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -10779.275390625, "rewards/margins": 9161.294921875, "rewards/rejected": -19940.568359375, "step": 2420 }, { "epoch": 0.47693817468105987, "grad_norm": 11740.096087059512, "learning_rate": 3.1311771464178655e-07, "logits/chosen": -6.378174781799316, "logits/rejected": -6.777315616607666, "logps/chosen": -11051.1083984375, "logps/rejected": -12446.439453125, "loss": -43.3986, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -10755.8603515625, "rewards/margins": 1474.361083984375, "rewards/rejected": -12230.220703125, "step": 2430 }, { "epoch": 0.478900883218842, "grad_norm": 7733.746306582557, "learning_rate": 3.1145876550693893e-07, "logits/chosen": -6.233151912689209, "logits/rejected": -6.735576629638672, "logps/chosen": -11342.470703125, "logps/rejected": -14228.3046875, "loss": -34.8307, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -11056.720703125, "rewards/margins": 2937.610595703125, "rewards/rejected": -13994.3310546875, "step": 2440 }, { "epoch": 0.48086359175662413, "grad_norm": 52768.760399582236, "learning_rate": 3.097969309908847e-07, "logits/chosen": -6.380805492401123, "logits/rejected": -6.4574151039123535, "logps/chosen": -8266.755859375, "logps/rejected": -6937.92431640625, "loss": -33.4903, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -8027.54296875, "rewards/margins": -1296.3603515625, "rewards/rejected": -6731.1826171875, "step": 2450 }, { "epoch": 0.48282630029440626, "grad_norm": 37565.77205732285, "learning_rate": 3.081322891138382e-07, "logits/chosen": -5.641980171203613, "logits/rejected": -5.83804178237915, "logps/chosen": -3776.92041015625, "logps/rejected": -8471.234375, "loss": -35.034, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3471.74169921875, "rewards/margins": 4737.46435546875, "rewards/rejected": -8209.2060546875, "step": 2460 }, { "epoch": 0.4847890088321884, "grad_norm": 18903.98569528581, "learning_rate": 3.0646491802781514e-07, "logits/chosen": -6.12729549407959, "logits/rejected": -6.105202674865723, "logps/chosen": -8582.6865234375, "logps/rejected": -4177.748046875, "loss": -33.0833, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -8316.0419921875, "rewards/margins": -4318.7705078125, "rewards/rejected": -3997.270263671875, "step": 2470 }, { "epoch": 0.4867517173699706, "grad_norm": 17388.46450989318, "learning_rate": 3.047948960129624e-07, "logits/chosen": -5.708449363708496, "logits/rejected": -6.1740007400512695, "logps/chosen": -2852.746337890625, "logps/rejected": -10738.833984375, "loss": -38.4425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2643.248779296875, "rewards/margins": 7878.05078125, "rewards/rejected": -10521.2998046875, "step": 2480 }, { "epoch": 0.4887144259077527, "grad_norm": 22463.93529400386, "learning_rate": 3.0312230147388334e-07, "logits/chosen": -5.58059024810791, "logits/rejected": -6.7109527587890625, "logps/chosen": -3980.739501953125, "logps/rejected": -12892.0, "loss": -59.395, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3671.518798828125, "rewards/margins": 8913.244140625, "rewards/rejected": -12584.763671875, "step": 2490 }, { "epoch": 0.49067713444553485, "grad_norm": 15002.758086169333, "learning_rate": 3.01447212935957e-07, "logits/chosen": -5.638707160949707, "logits/rejected": -6.036099433898926, "logps/chosen": -3293.352783203125, "logps/rejected": -7819.3720703125, "loss": -61.4311, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -3066.904541015625, "rewards/margins": 4501.4189453125, "rewards/rejected": -7568.3232421875, "step": 2500 }, { "epoch": 0.492639842983317, "grad_norm": 18711.676910439175, "learning_rate": 2.9976970904165104e-07, "logits/chosen": -5.423633575439453, "logits/rejected": -6.370456218719482, "logps/chosen": -5621.875, "logps/rejected": -13417.5654296875, "loss": -48.937, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -5236.93017578125, "rewards/margins": 7885.9521484375, "rewards/rejected": -13122.8828125, "step": 2510 }, { "epoch": 0.4946025515210991, "grad_norm": 16158.430916723883, "learning_rate": 2.980898685468301e-07, "logits/chosen": -6.011282920837402, "logits/rejected": -6.57055139541626, "logps/chosen": -8695.154296875, "logps/rejected": -9393.2177734375, "loss": -43.5711, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8395.5126953125, "rewards/margins": 764.98046875, "rewards/rejected": -9160.4921875, "step": 2520 }, { "epoch": 0.49656526005888124, "grad_norm": 20386.98953734317, "learning_rate": 2.96407770317058e-07, "logits/chosen": -6.262847423553467, "logits/rejected": -6.793898582458496, "logps/chosen": -6870.58740234375, "logps/rejected": -11639.966796875, "loss": -33.3231, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -6662.18994140625, "rewards/margins": 4769.36865234375, "rewards/rejected": -11431.5576171875, "step": 2530 }, { "epoch": 0.4985279685966634, "grad_norm": 3879.935648445349, "learning_rate": 2.9472349332389523e-07, "logits/chosen": -6.527504920959473, "logits/rejected": -7.152581214904785, "logps/chosen": -9698.3818359375, "logps/rejected": -11432.53515625, "loss": -41.3343, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -9405.0078125, "rewards/margins": 1827.9716796875, "rewards/rejected": -11232.978515625, "step": 2540 }, { "epoch": 0.5004906771344455, "grad_norm": 24298.10752817062, "learning_rate": 2.930371166411915e-07, "logits/chosen": -6.064120292663574, "logits/rejected": -6.721309661865234, "logps/chosen": -10091.9873046875, "logps/rejected": -12400.7763671875, "loss": -46.3356, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -9777.216796875, "rewards/margins": 2311.11865234375, "rewards/rejected": -12088.3359375, "step": 2550 }, { "epoch": 0.5024533856722276, "grad_norm": 11943.816253956084, "learning_rate": 2.913487194413731e-07, "logits/chosen": -5.539381980895996, "logits/rejected": -6.003456115722656, "logps/chosen": -4747.74755859375, "logps/rejected": -9473.9482421875, "loss": -30.455, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -4465.08544921875, "rewards/margins": 4729.84912109375, "rewards/rejected": -9194.935546875, "step": 2560 }, { "epoch": 0.5044160942100098, "grad_norm": 10734.420672958464, "learning_rate": 2.896583809917262e-07, "logits/chosen": -6.517728328704834, "logits/rejected": -6.652238368988037, "logps/chosen": -6882.8466796875, "logps/rejected": -9052.439453125, "loss": -24.9567, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -6658.3466796875, "rewards/margins": 2169.07568359375, "rewards/rejected": -8827.4228515625, "step": 2570 }, { "epoch": 0.5063788027477919, "grad_norm": 33065.06702271708, "learning_rate": 2.879661806506751e-07, "logits/chosen": -6.095475196838379, "logits/rejected": -6.364386558532715, "logps/chosen": -6472.1513671875, "logps/rejected": -8292.279296875, "loss": -29.0089, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -6199.2109375, "rewards/margins": 1783.4049072265625, "rewards/rejected": -7982.61572265625, "step": 2580 }, { "epoch": 0.5083415112855741, "grad_norm": 17594.515070458732, "learning_rate": 2.86272197864057e-07, "logits/chosen": -5.363958835601807, "logits/rejected": -6.321815013885498, "logps/chosen": -3914.507080078125, "logps/rejected": -12397.724609375, "loss": -38.8625, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -3593.971435546875, "rewards/margins": 8549.591796875, "rewards/rejected": -12143.564453125, "step": 2590 }, { "epoch": 0.5103042198233563, "grad_norm": 13413.45965820753, "learning_rate": 2.845765121613912e-07, "logits/chosen": -5.317925930023193, "logits/rejected": -6.1297287940979, "logps/chosen": -2945.64892578125, "logps/rejected": -8846.72265625, "loss": -62.6653, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -2650.088623046875, "rewards/margins": 5997.791015625, "rewards/rejected": -8647.880859375, "step": 2600 }, { "epoch": 0.5122669283611384, "grad_norm": 13296.957729194994, "learning_rate": 2.828792031521464e-07, "logits/chosen": -5.375241279602051, "logits/rejected": -6.6168341636657715, "logps/chosen": -4744.81591796875, "logps/rejected": -14173.826171875, "loss": -51.0486, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -4430.0498046875, "rewards/margins": 9448.6142578125, "rewards/rejected": -13878.666015625, "step": 2610 }, { "epoch": 0.5142296368989205, "grad_norm": 33574.966167589526, "learning_rate": 2.811803505220025e-07, "logits/chosen": -5.833651542663574, "logits/rejected": -6.36823844909668, "logps/chosen": -8814.9169921875, "logps/rejected": -9170.84375, "loss": -20.2881, "rewards/accuracies": 0.5, "rewards/chosen": -8551.40234375, "rewards/margins": 411.74822998046875, "rewards/rejected": -8963.150390625, "step": 2620 }, { "epoch": 0.5161923454367027, "grad_norm": 12585.2915198008, "learning_rate": 2.7948003402910975e-07, "logits/chosen": -6.554004669189453, "logits/rejected": -7.095139503479004, "logps/chosen": -10092.1953125, "logps/rejected": -11927.392578125, "loss": -34.5027, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -9794.150390625, "rewards/margins": 1879.41796875, "rewards/rejected": -11673.568359375, "step": 2630 }, { "epoch": 0.5181550539744848, "grad_norm": 10418.89064361854, "learning_rate": 2.777783335003442e-07, "logits/chosen": -5.868377208709717, "logits/rejected": -6.875641822814941, "logps/chosen": -5577.24267578125, "logps/rejected": -10460.369140625, "loss": -16.7983, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -5265.66650390625, "rewards/margins": 4934.72021484375, "rewards/rejected": -10200.3876953125, "step": 2640 }, { "epoch": 0.5201177625122669, "grad_norm": 16733.946320231044, "learning_rate": 2.760753288275598e-07, "logits/chosen": -5.924532890319824, "logits/rejected": -6.515267848968506, "logps/chosen": -7140.69921875, "logps/rejected": -9910.255859375, "loss": -28.087, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -6884.3251953125, "rewards/margins": 2773.028076171875, "rewards/rejected": -9657.353515625, "step": 2650 }, { "epoch": 0.5220804710500491, "grad_norm": 23976.182005709612, "learning_rate": 2.7437109996383795e-07, "logits/chosen": -5.747305870056152, "logits/rejected": -6.309284687042236, "logps/chosen": -5362.365234375, "logps/rejected": -7944.8662109375, "loss": -34.9118, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -5106.931640625, "rewards/margins": 2627.732177734375, "rewards/rejected": -7734.6640625, "step": 2660 }, { "epoch": 0.5240431795878312, "grad_norm": 20039.924367229363, "learning_rate": 2.7266572691973365e-07, "logits/chosen": -5.86647891998291, "logits/rejected": -6.187295436859131, "logps/chosen": -8747.4462890625, "logps/rejected": -10428.939453125, "loss": -47.6488, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -8413.8056640625, "rewards/margins": 1738.4195556640625, "rewards/rejected": -10152.2265625, "step": 2670 }, { "epoch": 0.5260058881256133, "grad_norm": 12005.973631187191, "learning_rate": 2.709592897595191e-07, "logits/chosen": -5.817458152770996, "logits/rejected": -6.226616859436035, "logps/chosen": -3584.112060546875, "logps/rejected": -5519.8154296875, "loss": -30.2079, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3308.314453125, "rewards/margins": 2025.8785400390625, "rewards/rejected": -5334.193359375, "step": 2680 }, { "epoch": 0.5279685966633955, "grad_norm": 30028.096117077403, "learning_rate": 2.6925186859742494e-07, "logits/chosen": -6.055359840393066, "logits/rejected": -6.1964826583862305, "logps/chosen": -6304.94775390625, "logps/rejected": -11084.908203125, "loss": -36.7132, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -6047.52685546875, "rewards/margins": 4809.43359375, "rewards/rejected": -10856.9599609375, "step": 2690 }, { "epoch": 0.5299313052011776, "grad_norm": 11136.881550252963, "learning_rate": 2.675435435938788e-07, "logits/chosen": -5.943601608276367, "logits/rejected": -6.616503715515137, "logps/chosen": -7073.62744140625, "logps/rejected": -11624.15625, "loss": -34.4878, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -6758.0166015625, "rewards/margins": 4610.0634765625, "rewards/rejected": -11368.0791015625, "step": 2700 }, { "epoch": 0.5318940137389597, "grad_norm": 25178.681348085753, "learning_rate": 2.6583439495174247e-07, "logits/chosen": -6.088890552520752, "logits/rejected": -6.231939315795898, "logps/chosen": -9590.7861328125, "logps/rejected": -9357.4892578125, "loss": -27.5907, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -9299.8857421875, "rewards/margins": -182.4521484375, "rewards/rejected": -9117.4326171875, "step": 2710 }, { "epoch": 0.5338567222767419, "grad_norm": 18835.06547811897, "learning_rate": 2.6412450291254564e-07, "logits/chosen": -5.743609428405762, "logits/rejected": -6.17622709274292, "logps/chosen": -3307.915283203125, "logps/rejected": -8732.0126953125, "loss": -46.6126, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3016.544921875, "rewards/margins": 5474.3740234375, "rewards/rejected": -8490.919921875, "step": 2720 }, { "epoch": 0.535819430814524, "grad_norm": 11035.242722119494, "learning_rate": 2.6241394775271954e-07, "logits/chosen": -6.31672477722168, "logits/rejected": -6.637670040130615, "logps/chosen": -7725.55859375, "logps/rejected": -8656.6337890625, "loss": -29.402, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -7465.0322265625, "rewards/margins": 962.5202026367188, "rewards/rejected": -8427.552734375, "step": 2730 }, { "epoch": 0.5377821393523062, "grad_norm": 10456.015681407709, "learning_rate": 2.607028097798276e-07, "logits/chosen": -5.451465129852295, "logits/rejected": -6.194781303405762, "logps/chosen": -4029.79248046875, "logps/rejected": -10183.9404296875, "loss": -56.4387, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3743.822265625, "rewards/margins": 6147.2880859375, "rewards/rejected": -9891.1103515625, "step": 2740 }, { "epoch": 0.5397448478900884, "grad_norm": 24979.30946157041, "learning_rate": 2.5899116932879534e-07, "logits/chosen": -6.051209926605225, "logits/rejected": -6.451709747314453, "logps/chosen": -5342.54638671875, "logps/rejected": -8331.91015625, "loss": -58.6434, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -5142.64892578125, "rewards/margins": 2980.75927734375, "rewards/rejected": -8123.4091796875, "step": 2750 }, { "epoch": 0.5417075564278705, "grad_norm": 18128.688887041062, "learning_rate": 2.5727910675813866e-07, "logits/chosen": -5.9375319480896, "logits/rejected": -6.503568172454834, "logps/chosen": -5812.185546875, "logps/rejected": -12971.08984375, "loss": -48.012, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -5559.2744140625, "rewards/margins": 7139.5205078125, "rewards/rejected": -12698.7958984375, "step": 2760 }, { "epoch": 0.5436702649656526, "grad_norm": 19166.378723015816, "learning_rate": 2.555667024461915e-07, "logits/chosen": -6.17089319229126, "logits/rejected": -6.559246063232422, "logps/chosen": -5490.09912109375, "logps/rejected": -11935.837890625, "loss": -41.5186, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -5257.4560546875, "rewards/margins": 6412.44384765625, "rewards/rejected": -11669.900390625, "step": 2770 }, { "epoch": 0.5456329735034348, "grad_norm": 32852.85119975119, "learning_rate": 2.5385403678733157e-07, "logits/chosen": -6.6432647705078125, "logits/rejected": -6.7126054763793945, "logps/chosen": -11160.3837890625, "logps/rejected": -12538.3642578125, "loss": -32.6515, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -10961.1875, "rewards/margins": 1345.627685546875, "rewards/rejected": -12306.8154296875, "step": 2780 }, { "epoch": 0.5475956820412169, "grad_norm": 10830.353928303128, "learning_rate": 2.521411901882067e-07, "logits/chosen": -6.252813339233398, "logits/rejected": -6.589575290679932, "logps/chosen": -10153.9697265625, "logps/rejected": -12538.3125, "loss": -44.7746, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -9891.1650390625, "rewards/margins": 2441.12158203125, "rewards/rejected": -12332.2861328125, "step": 2790 }, { "epoch": 0.549558390578999, "grad_norm": 27035.949757303744, "learning_rate": 2.504282430639594e-07, "logits/chosen": -5.5261664390563965, "logits/rejected": -6.288464069366455, "logps/chosen": -6266.47412109375, "logps/rejected": -11536.119140625, "loss": -58.0714, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -6058.314453125, "rewards/margins": 5273.22021484375, "rewards/rejected": -11331.5361328125, "step": 2800 }, { "epoch": 0.5515210991167812, "grad_norm": 33690.556775017176, "learning_rate": 2.4871527583445163e-07, "logits/chosen": -6.1002912521362305, "logits/rejected": -6.546652317047119, "logps/chosen": -8360.79296875, "logps/rejected": -10610.951171875, "loss": -34.0981, "rewards/accuracies": 0.4999999403953552, "rewards/chosen": -8051.3642578125, "rewards/margins": 2286.067626953125, "rewards/rejected": -10337.4306640625, "step": 2810 }, { "epoch": 0.5534838076545633, "grad_norm": 15246.564066283772, "learning_rate": 2.470023689204893e-07, "logits/chosen": -6.285436153411865, "logits/rejected": -6.436059474945068, "logps/chosen": -10982.6376953125, "logps/rejected": -11920.1044921875, "loss": -34.2879, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -10688.08203125, "rewards/margins": 952.1345825195312, "rewards/rejected": -11640.216796875, "step": 2820 }, { "epoch": 0.5554465161923454, "grad_norm": 13817.5152125985, "learning_rate": 2.452896027400465e-07, "logits/chosen": -5.8990373611450195, "logits/rejected": -6.614035129547119, "logps/chosen": -7396.47412109375, "logps/rejected": -11294.685546875, "loss": -46.6119, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -7115.9072265625, "rewards/margins": 3896.42578125, "rewards/rejected": -11012.33203125, "step": 2830 }, { "epoch": 0.5574092247301276, "grad_norm": 21413.80465054335, "learning_rate": 2.4357705770449046e-07, "logits/chosen": -5.740229606628418, "logits/rejected": -6.4465532302856445, "logps/chosen": -2700.2880859375, "logps/rejected": -10101.0986328125, "loss": -43.8914, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -2450.4267578125, "rewards/margins": 7413.0341796875, "rewards/rejected": -9863.4599609375, "step": 2840 }, { "epoch": 0.5593719332679097, "grad_norm": 17562.590939454076, "learning_rate": 2.418648142148056e-07, "logits/chosen": -6.154690742492676, "logits/rejected": -6.859691619873047, "logps/chosen": -3651.078857421875, "logps/rejected": -7937.0029296875, "loss": -44.2833, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -3367.26904296875, "rewards/margins": 4352.5908203125, "rewards/rejected": -7719.8603515625, "step": 2850 }, { "epoch": 0.5613346418056918, "grad_norm": 13240.395691118403, "learning_rate": 2.4015295265781966e-07, "logits/chosen": -5.901522159576416, "logits/rejected": -6.384616851806641, "logps/chosen": -5881.0244140625, "logps/rejected": -13143.9609375, "loss": -60.2603, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -5582.77587890625, "rewards/margins": 7252.5478515625, "rewards/rejected": -12835.32421875, "step": 2860 }, { "epoch": 0.563297350343474, "grad_norm": 34206.18306932717, "learning_rate": 2.3844155340242893e-07, "logits/chosen": -6.090862274169922, "logits/rejected": -6.095606803894043, "logps/chosen": -8231.333984375, "logps/rejected": -8128.671875, "loss": -36.4379, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": -8015.18994140625, "rewards/margins": -105.35820007324219, "rewards/rejected": -7909.83203125, "step": 2870 }, { "epoch": 0.5652600588812562, "grad_norm": 17059.90117071475, "learning_rate": 2.36730696795826e-07, "logits/chosen": -6.123776912689209, "logits/rejected": -5.706298828125, "logps/chosen": -9483.287109375, "logps/rejected": -6523.83203125, "loss": -37.9418, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -9243.080078125, "rewards/margins": -3055.062255859375, "rewards/rejected": -6188.01708984375, "step": 2880 }, { "epoch": 0.5672227674190383, "grad_norm": 43326.867641306664, "learning_rate": 2.3502046315972655e-07, "logits/chosen": -5.902927875518799, "logits/rejected": -6.30301570892334, "logps/chosen": -7401.2041015625, "logps/rejected": -8702.8212890625, "loss": -35.4629, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -7082.4560546875, "rewards/margins": 1326.578369140625, "rewards/rejected": -8409.0341796875, "step": 2890 }, { "epoch": 0.5691854759568205, "grad_norm": 29862.62116556775, "learning_rate": 2.3331093278659906e-07, "logits/chosen": -5.886011600494385, "logits/rejected": -6.371966361999512, "logps/chosen": -6040.6962890625, "logps/rejected": -10261.93359375, "loss": -42.5762, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -5732.1044921875, "rewards/margins": 4245.67236328125, "rewards/rejected": -9977.77734375, "step": 2900 }, { "epoch": 0.5711481844946026, "grad_norm": 29374.859558796845, "learning_rate": 2.31602185935895e-07, "logits/chosen": -6.060447692871094, "logits/rejected": -6.565158843994141, "logps/chosen": -8422.4423828125, "logps/rejected": -11812.509765625, "loss": -58.7072, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8138.63427734375, "rewards/margins": 3450.626220703125, "rewards/rejected": -11589.259765625, "step": 2910 }, { "epoch": 0.5731108930323847, "grad_norm": 32547.86978902682, "learning_rate": 2.298943028302811e-07, "logits/chosen": -5.83259391784668, "logits/rejected": -6.5215349197387695, "logps/chosen": -12311.7412109375, "logps/rejected": -19372.44140625, "loss": -40.0765, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -12022.2646484375, "rewards/margins": 7038.92041015625, "rewards/rejected": -19061.18359375, "step": 2920 }, { "epoch": 0.5750736015701668, "grad_norm": 2060.9821524550994, "learning_rate": 2.2818736365187242e-07, "logits/chosen": -5.577447414398193, "logits/rejected": -6.101305961608887, "logps/chosen": -6291.8486328125, "logps/rejected": -10588.484375, "loss": -55.9533, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -6064.76123046875, "rewards/margins": 4331.734375, "rewards/rejected": -10396.49609375, "step": 2930 }, { "epoch": 0.577036310107949, "grad_norm": 25822.211335244916, "learning_rate": 2.2648144853846847e-07, "logits/chosen": -5.980238914489746, "logits/rejected": -5.969510078430176, "logps/chosen": -5808.5185546875, "logps/rejected": -7143.390625, "loss": -42.3305, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -5549.3349609375, "rewards/margins": 1328.544677734375, "rewards/rejected": -6877.8798828125, "step": 2940 }, { "epoch": 0.5789990186457311, "grad_norm": 24655.1856219997, "learning_rate": 2.247766375797906e-07, "logits/chosen": -6.019628047943115, "logits/rejected": -5.976606845855713, "logps/chosen": -5347.34716796875, "logps/rejected": -8930.677734375, "loss": -41.2566, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -5155.9521484375, "rewards/margins": 3569.93798828125, "rewards/rejected": -8725.890625, "step": 2950 }, { "epoch": 0.5809617271835132, "grad_norm": 16368.527775909024, "learning_rate": 2.2307301081372222e-07, "logits/chosen": -5.832452774047852, "logits/rejected": -6.198973655700684, "logps/chosen": -4539.05322265625, "logps/rejected": -10474.8701171875, "loss": -42.4044, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -4281.1552734375, "rewards/margins": 5917.89404296875, "rewards/rejected": -10199.048828125, "step": 2960 }, { "epoch": 0.5829244357212954, "grad_norm": 42662.13732363794, "learning_rate": 2.2137064822255086e-07, "logits/chosen": -6.167731761932373, "logits/rejected": -6.236588954925537, "logps/chosen": -11024.490234375, "logps/rejected": -9996.1015625, "loss": -33.0686, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -10803.732421875, "rewards/margins": -1023.5994873046875, "rewards/rejected": -9780.1328125, "step": 2970 }, { "epoch": 0.5848871442590775, "grad_norm": 32726.995020163744, "learning_rate": 2.1966962972921322e-07, "logits/chosen": -6.197951793670654, "logits/rejected": -6.607583045959473, "logps/chosen": -11667.0986328125, "logps/rejected": -16752.43359375, "loss": -73.6322, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -11430.2578125, "rewards/margins": 5052.4306640625, "rewards/rejected": -16482.6875, "step": 2980 }, { "epoch": 0.5868498527968596, "grad_norm": 21065.846155098126, "learning_rate": 2.1797003519354285e-07, "logits/chosen": -6.102856159210205, "logits/rejected": -6.160183906555176, "logps/chosen": -9727.7099609375, "logps/rejected": -14681.419921875, "loss": -50.3626, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -9465.8388671875, "rewards/margins": 4952.23681640625, "rewards/rejected": -14418.0751953125, "step": 2990 }, { "epoch": 0.5888125613346418, "grad_norm": 14642.039539393882, "learning_rate": 2.1627194440852142e-07, "logits/chosen": -6.262144088745117, "logits/rejected": -6.25952672958374, "logps/chosen": -7546.4482421875, "logps/rejected": -11042.203125, "loss": -32.7043, "rewards/accuracies": 0.4999999403953552, "rewards/chosen": -7252.67333984375, "rewards/margins": 3531.340576171875, "rewards/rejected": -10784.013671875, "step": 3000 }, { "epoch": 0.5907752698724239, "grad_norm": 20645.896280922672, "learning_rate": 2.1457543709653176e-07, "logits/chosen": -5.806201457977295, "logits/rejected": -6.2418341636657715, "logps/chosen": -9375.548828125, "logps/rejected": -15415.654296875, "loss": -15.2684, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -9101.755859375, "rewards/margins": 6062.15478515625, "rewards/rejected": -15163.9111328125, "step": 3010 }, { "epoch": 0.592737978410206, "grad_norm": 25970.565533259738, "learning_rate": 2.128805929056154e-07, "logits/chosen": -5.703097343444824, "logits/rejected": -5.557769298553467, "logps/chosen": -5550.19873046875, "logps/rejected": -8762.3837890625, "loss": -40.5735, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -5369.36376953125, "rewards/margins": 3198.438232421875, "rewards/rejected": -8567.802734375, "step": 3020 }, { "epoch": 0.5947006869479883, "grad_norm": 31724.7511015446, "learning_rate": 2.1118749140573358e-07, "logits/chosen": -5.512729644775391, "logits/rejected": -6.430905818939209, "logps/chosen": -6663.1103515625, "logps/rejected": -8712.017578125, "loss": -50.5914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6408.951171875, "rewards/margins": 2051.845703125, "rewards/rejected": -8460.796875, "step": 3030 }, { "epoch": 0.5966633954857704, "grad_norm": 20683.459471901064, "learning_rate": 2.0949621208503092e-07, "logits/chosen": -5.876718997955322, "logits/rejected": -6.307151794433594, "logps/chosen": -5537.3876953125, "logps/rejected": -7440.6279296875, "loss": -9.4699, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -5236.66650390625, "rewards/margins": 1982.4974365234375, "rewards/rejected": -7219.1630859375, "step": 3040 }, { "epoch": 0.5986261040235525, "grad_norm": 7846.410710642343, "learning_rate": 2.0780683434610413e-07, "logits/chosen": -6.061627388000488, "logits/rejected": -6.230423450469971, "logps/chosen": -8826.4599609375, "logps/rejected": -12277.615234375, "loss": -46.5321, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -8586.0126953125, "rewards/margins": 3436.86767578125, "rewards/rejected": -12022.8798828125, "step": 3050 }, { "epoch": 0.6005888125613347, "grad_norm": 11560.571401866402, "learning_rate": 2.0611943750227375e-07, "logits/chosen": -6.100982666015625, "logits/rejected": -6.434123992919922, "logps/chosen": -7153.65380859375, "logps/rejected": -11196.271484375, "loss": -44.7029, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -6891.0576171875, "rewards/margins": 4061.989501953125, "rewards/rejected": -10953.046875, "step": 3060 }, { "epoch": 0.6025515210991168, "grad_norm": 25252.639920178066, "learning_rate": 2.044341007738612e-07, "logits/chosen": -5.304543495178223, "logits/rejected": -6.203813076019287, "logps/chosen": -5856.3046875, "logps/rejected": -10055.4853515625, "loss": -13.3827, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -5505.44140625, "rewards/margins": 4252.4677734375, "rewards/rejected": -9757.9091796875, "step": 3070 }, { "epoch": 0.6045142296368989, "grad_norm": 15962.537149912641, "learning_rate": 2.027509032844687e-07, "logits/chosen": -5.996480941772461, "logits/rejected": -5.857324600219727, "logps/chosen": -9649.64453125, "logps/rejected": -10357.9638671875, "loss": -35.3732, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -9329.5810546875, "rewards/margins": 676.6570434570312, "rewards/rejected": -10006.2392578125, "step": 3080 }, { "epoch": 0.6064769381746811, "grad_norm": 20902.216724376667, "learning_rate": 2.010699240572651e-07, "logits/chosen": -6.003744125366211, "logits/rejected": -6.276988506317139, "logps/chosen": -7220.1455078125, "logps/rejected": -11391.1689453125, "loss": -41.0877, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -6861.19384765625, "rewards/margins": 4204.9052734375, "rewards/rejected": -11066.0986328125, "step": 3090 }, { "epoch": 0.6084396467124632, "grad_norm": 46731.140235376915, "learning_rate": 1.993912420112756e-07, "logits/chosen": -6.194393157958984, "logits/rejected": -6.91534948348999, "logps/chosen": -6294.78515625, "logps/rejected": -17475.001953125, "loss": -64.9345, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -5971.33984375, "rewards/margins": 11119.5146484375, "rewards/rejected": -17090.85546875, "step": 3100 }, { "epoch": 0.6104023552502453, "grad_norm": 17387.051853761066, "learning_rate": 1.9771493595767707e-07, "logits/chosen": -5.873591423034668, "logits/rejected": -6.414576053619385, "logps/chosen": -6968.84765625, "logps/rejected": -12852.6943359375, "loss": -51.6405, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -6683.16552734375, "rewards/margins": 5842.7548828125, "rewards/rejected": -12525.919921875, "step": 3110 }, { "epoch": 0.6123650637880275, "grad_norm": 22634.01933915312, "learning_rate": 1.9604108459609752e-07, "logits/chosen": -6.6660966873168945, "logits/rejected": -6.5701704025268555, "logps/chosen": -10690.140625, "logps/rejected": -13625.5673828125, "loss": -28.145, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": -10370.3828125, "rewards/margins": 2937.550537109375, "rewards/rejected": -13307.9326171875, "step": 3120 }, { "epoch": 0.6143277723258096, "grad_norm": 9747.650877233733, "learning_rate": 1.9436976651092142e-07, "logits/chosen": -5.810235500335693, "logits/rejected": -6.1566386222839355, "logps/chosen": -7350.38037109375, "logps/rejected": -10248.6015625, "loss": -48.291, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7041.6875, "rewards/margins": 2907.492919921875, "rewards/rejected": -9949.1806640625, "step": 3130 }, { "epoch": 0.6162904808635917, "grad_norm": 30236.333496415704, "learning_rate": 1.9270106016760035e-07, "logits/chosen": -5.480429172515869, "logits/rejected": -6.225604057312012, "logps/chosen": -6119.3837890625, "logps/rejected": -12521.4765625, "loss": -26.1303, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5855.6005859375, "rewards/margins": 6393.10546875, "rewards/rejected": -12248.70703125, "step": 3140 }, { "epoch": 0.6182531894013739, "grad_norm": 27847.069959914832, "learning_rate": 1.9103504390896944e-07, "logits/chosen": -6.092742443084717, "logits/rejected": -6.282870292663574, "logps/chosen": -7724.9345703125, "logps/rejected": -11288.4189453125, "loss": -62.7217, "rewards/accuracies": 0.5, "rewards/chosen": -7501.0439453125, "rewards/margins": 3501.46630859375, "rewards/rejected": -11002.509765625, "step": 3150 }, { "epoch": 0.620215897939156, "grad_norm": 15717.876426378978, "learning_rate": 1.8937179595156876e-07, "logits/chosen": -5.727993488311768, "logits/rejected": -6.502285003662109, "logps/chosen": -8314.7880859375, "logps/rejected": -12213.6162109375, "loss": -64.8583, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -8035.83056640625, "rewards/margins": 3968.806640625, "rewards/rejected": -12004.63671875, "step": 3160 }, { "epoch": 0.6221786064769381, "grad_norm": 21708.62263024755, "learning_rate": 1.8771139438197168e-07, "logits/chosen": -5.832732200622559, "logits/rejected": -6.131331443786621, "logps/chosen": -3498.958984375, "logps/rejected": -5993.869140625, "loss": -20.8637, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -3206.81005859375, "rewards/margins": 2487.585693359375, "rewards/rejected": -5694.39599609375, "step": 3170 }, { "epoch": 0.6241413150147204, "grad_norm": 4352.322528867045, "learning_rate": 1.8605391715311846e-07, "logits/chosen": -5.135312080383301, "logits/rejected": -5.9587907791137695, "logps/chosen": -4988.3896484375, "logps/rejected": -6992.62646484375, "loss": -50.3464, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": -4703.3408203125, "rewards/margins": 2082.06884765625, "rewards/rejected": -6785.4111328125, "step": 3180 }, { "epoch": 0.6261040235525025, "grad_norm": 26492.25398112592, "learning_rate": 1.8439944208065704e-07, "logits/chosen": -5.540990352630615, "logits/rejected": -6.210555076599121, "logps/chosen": -4282.52587890625, "logps/rejected": -12876.861328125, "loss": -62.7938, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -3932.91162109375, "rewards/margins": 8617.396484375, "rewards/rejected": -12550.30859375, "step": 3190 }, { "epoch": 0.6280667320902846, "grad_norm": 24481.56078018236, "learning_rate": 1.8274804683928913e-07, "logits/chosen": -5.726921081542969, "logits/rejected": -6.83974552154541, "logps/chosen": -7953.31494140625, "logps/rejected": -14902.912109375, "loss": -48.2978, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -7620.45166015625, "rewards/margins": 6998.75927734375, "rewards/rejected": -14619.208984375, "step": 3200 }, { "epoch": 0.6300294406280668, "grad_norm": 29545.176315884106, "learning_rate": 1.810998089591238e-07, "logits/chosen": -5.543051242828369, "logits/rejected": -6.097823143005371, "logps/chosen": -7330.84619140625, "logps/rejected": -11870.203125, "loss": -28.2878, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -7077.7724609375, "rewards/margins": 4533.67431640625, "rewards/rejected": -11611.447265625, "step": 3210 }, { "epoch": 0.6319921491658489, "grad_norm": 37000.804055841756, "learning_rate": 1.7945480582203745e-07, "logits/chosen": -6.002919673919678, "logits/rejected": -6.045916557312012, "logps/chosen": -9654.12890625, "logps/rejected": -14309.685546875, "loss": -48.176, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -9405.287109375, "rewards/margins": 4636.5673828125, "rewards/rejected": -14041.8544921875, "step": 3220 }, { "epoch": 0.633954857703631, "grad_norm": 30212.427138494844, "learning_rate": 1.7781311465804128e-07, "logits/chosen": -6.6009650230407715, "logits/rejected": -6.658873558044434, "logps/chosen": -10841.62109375, "logps/rejected": -13201.4658203125, "loss": -26.6506, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -10562.521484375, "rewards/margins": 2405.37841796875, "rewards/rejected": -12967.900390625, "step": 3230 }, { "epoch": 0.6359175662414132, "grad_norm": 19090.48850906681, "learning_rate": 1.7617481254165487e-07, "logits/chosen": -6.290364742279053, "logits/rejected": -6.553230285644531, "logps/chosen": -9820.6904296875, "logps/rejected": -14816.169921875, "loss": -36.8796, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -9549.0849609375, "rewards/margins": 5044.22265625, "rewards/rejected": -14593.3076171875, "step": 3240 }, { "epoch": 0.6378802747791953, "grad_norm": 7591.545709353683, "learning_rate": 1.745399763882881e-07, "logits/chosen": -5.728682518005371, "logits/rejected": -5.9819536209106445, "logps/chosen": -3211.0498046875, "logps/rejected": -10394.482421875, "loss": -59.9801, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -2921.296875, "rewards/margins": 7196.39990234375, "rewards/rejected": -10117.697265625, "step": 3250 }, { "epoch": 0.6398429833169774, "grad_norm": 33411.75386192735, "learning_rate": 1.7290868295062983e-07, "logits/chosen": -5.793447494506836, "logits/rejected": -5.8636474609375, "logps/chosen": -5855.99951171875, "logps/rejected": -8921.2919921875, "loss": -51.7521, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -5568.60791015625, "rewards/margins": 3067.75732421875, "rewards/rejected": -8636.365234375, "step": 3260 }, { "epoch": 0.6418056918547596, "grad_norm": 16067.34472519018, "learning_rate": 1.7128100881504492e-07, "logits/chosen": -5.714947700500488, "logits/rejected": -6.102677822113037, "logps/chosen": -5039.84619140625, "logps/rejected": -7595.53662109375, "loss": -20.5676, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -4753.2724609375, "rewards/margins": 2624.868896484375, "rewards/rejected": -7378.1416015625, "step": 3270 }, { "epoch": 0.6437684003925417, "grad_norm": 39619.80097302904, "learning_rate": 1.6965703039797808e-07, "logits/chosen": -5.740197658538818, "logits/rejected": -5.711764335632324, "logps/chosen": -8567.548828125, "logps/rejected": -8937.107421875, "loss": -34.2916, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -8252.1162109375, "rewards/margins": 458.0352478027344, "rewards/rejected": -8710.15234375, "step": 3280 }, { "epoch": 0.6457311089303238, "grad_norm": 31376.486237086294, "learning_rate": 1.6803682394236656e-07, "logits/chosen": -5.570681571960449, "logits/rejected": -6.147989273071289, "logps/chosen": -5871.93603515625, "logps/rejected": -10064.046875, "loss": -18.3966, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -5539.5986328125, "rewards/margins": 4255.6005859375, "rewards/rejected": -9795.19921875, "step": 3290 }, { "epoch": 0.647693817468106, "grad_norm": 44249.356923894535, "learning_rate": 1.664204655140607e-07, "logits/chosen": -5.888121128082275, "logits/rejected": -5.4714460372924805, "logps/chosen": -6671.8759765625, "logps/rejected": -9905.056640625, "loss": -56.0949, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -6434.58642578125, "rewards/margins": 3191.84619140625, "rewards/rejected": -9626.431640625, "step": 3300 }, { "epoch": 0.6496565260058881, "grad_norm": 31042.589164944533, "learning_rate": 1.6480803099825277e-07, "logits/chosen": -5.627917766571045, "logits/rejected": -6.0468244552612305, "logps/chosen": -7155.8828125, "logps/rejected": -11128.4873046875, "loss": -65.0139, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": -6894.2548828125, "rewards/margins": 4013.03564453125, "rewards/rejected": -10907.2890625, "step": 3310 }, { "epoch": 0.6516192345436702, "grad_norm": 25005.20521539522, "learning_rate": 1.6319959609591412e-07, "logits/chosen": -5.991621017456055, "logits/rejected": -6.41912841796875, "logps/chosen": -7819.74853515625, "logps/rejected": -11558.2021484375, "loss": -31.5012, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -7577.1728515625, "rewards/margins": 3785.04345703125, "rewards/rejected": -11362.216796875, "step": 3320 }, { "epoch": 0.6535819430814525, "grad_norm": 19185.614506265567, "learning_rate": 1.6159523632024126e-07, "logits/chosen": -5.926007270812988, "logits/rejected": -5.851027965545654, "logps/chosen": -9957.6728515625, "logps/rejected": -10125.80859375, "loss": -20.0844, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -9668.2685546875, "rewards/margins": 141.96133422851562, "rewards/rejected": -9810.23046875, "step": 3330 }, { "epoch": 0.6555446516192346, "grad_norm": 32960.85393274271, "learning_rate": 1.599950269931107e-07, "logits/chosen": -5.916158676147461, "logits/rejected": -6.095315456390381, "logps/chosen": -8739.169921875, "logps/rejected": -8563.7607421875, "loss": -37.1153, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -8442.9326171875, "rewards/margins": -131.4261932373047, "rewards/rejected": -8311.5068359375, "step": 3340 }, { "epoch": 0.6575073601570167, "grad_norm": 29891.22243114801, "learning_rate": 1.5839904324154273e-07, "logits/chosen": -5.7002058029174805, "logits/rejected": -6.260601043701172, "logps/chosen": -4179.4189453125, "logps/rejected": -11660.3232421875, "loss": -56.3478, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -3894.395263671875, "rewards/margins": 7470.75634765625, "rewards/rejected": -11365.1513671875, "step": 3350 }, { "epoch": 0.6594700686947988, "grad_norm": 26110.469353522592, "learning_rate": 1.568073599941742e-07, "logits/chosen": -5.362790584564209, "logits/rejected": -5.80797815322876, "logps/chosen": -7910.3359375, "logps/rejected": -15544.359375, "loss": -61.4841, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -7611.6669921875, "rewards/margins": 7659.12646484375, "rewards/rejected": -15270.794921875, "step": 3360 }, { "epoch": 0.661432777232581, "grad_norm": 17990.368363310597, "learning_rate": 1.552200519777408e-07, "logits/chosen": -5.812250137329102, "logits/rejected": -5.974585056304932, "logps/chosen": -8365.4921875, "logps/rejected": -9546.853515625, "loss": -45.0347, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -8046.81005859375, "rewards/margins": 1251.6302490234375, "rewards/rejected": -9298.4404296875, "step": 3370 }, { "epoch": 0.6633954857703631, "grad_norm": 27901.298177501503, "learning_rate": 1.5363719371356882e-07, "logits/chosen": -5.009722709655762, "logits/rejected": -5.893799781799316, "logps/chosen": -4510.7548828125, "logps/rejected": -12799.8349609375, "loss": -51.5022, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -4169.9951171875, "rewards/margins": 8402.2060546875, "rewards/rejected": -12572.2001953125, "step": 3380 }, { "epoch": 0.6653581943081452, "grad_norm": 35465.04076431891, "learning_rate": 1.5205885951407665e-07, "logits/chosen": -5.93643856048584, "logits/rejected": -5.664778709411621, "logps/chosen": -10530.7109375, "logps/rejected": -13409.2001953125, "loss": -40.6955, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -10276.90234375, "rewards/margins": 2829.52685546875, "rewards/rejected": -13106.4287109375, "step": 3390 }, { "epoch": 0.6673209028459274, "grad_norm": 24578.147368026366, "learning_rate": 1.5048512347928564e-07, "logits/chosen": -5.5513529777526855, "logits/rejected": -6.251401901245117, "logps/chosen": -9071.154296875, "logps/rejected": -11816.697265625, "loss": -60.1862, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -8782.8623046875, "rewards/margins": 2831.478271484375, "rewards/rejected": -11614.341796875, "step": 3400 }, { "epoch": 0.6692836113837095, "grad_norm": 28130.24056454807, "learning_rate": 1.4891605949334133e-07, "logits/chosen": -5.1147356033325195, "logits/rejected": -5.837545394897461, "logps/chosen": -10734.4208984375, "logps/rejected": -15567.869140625, "loss": -49.2724, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": -10261.611328125, "rewards/margins": 4926.6669921875, "rewards/rejected": -15188.279296875, "step": 3410 }, { "epoch": 0.6712463199214916, "grad_norm": 15093.648696774697, "learning_rate": 1.4735174122104476e-07, "logits/chosen": -5.686190605163574, "logits/rejected": -6.037934303283691, "logps/chosen": -5971.4521484375, "logps/rejected": -12716.578125, "loss": -44.5627, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -5732.052734375, "rewards/margins": 6785.609375, "rewards/rejected": -12517.662109375, "step": 3420 }, { "epoch": 0.6732090284592738, "grad_norm": 30406.610706584604, "learning_rate": 1.457922421043943e-07, "logits/chosen": -5.492599964141846, "logits/rejected": -5.887878894805908, "logps/chosen": -6392.69970703125, "logps/rejected": -6286.8212890625, "loss": -20.8657, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6072.6572265625, "rewards/margins": 8.783960342407227, "rewards/rejected": -6081.44140625, "step": 3430 }, { "epoch": 0.6751717369970559, "grad_norm": 10959.423503774728, "learning_rate": 1.4423763535913704e-07, "logits/chosen": -5.693334579467773, "logits/rejected": -5.849954128265381, "logps/chosen": -7127.61865234375, "logps/rejected": -9831.96875, "loss": -29.1369, "rewards/accuracies": 0.6666667461395264, "rewards/chosen": -6860.1748046875, "rewards/margins": 2703.759033203125, "rewards/rejected": -9563.93359375, "step": 3440 }, { "epoch": 0.677134445534838, "grad_norm": 13565.274744649627, "learning_rate": 1.426879939713322e-07, "logits/chosen": -5.775158882141113, "logits/rejected": -5.952229976654053, "logps/chosen": -10307.3232421875, "logps/rejected": -13023.1416015625, "loss": -37.16, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -10003.4775390625, "rewards/margins": 2784.02783203125, "rewards/rejected": -12787.505859375, "step": 3450 }, { "epoch": 0.6790971540726202, "grad_norm": 27566.5416441535, "learning_rate": 1.4114339069392374e-07, "logits/chosen": -5.485426902770996, "logits/rejected": -6.122304439544678, "logps/chosen": -7413.1904296875, "logps/rejected": -11015.4990234375, "loss": -51.0472, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -7116.50146484375, "rewards/margins": 3687.375, "rewards/rejected": -10803.876953125, "step": 3460 }, { "epoch": 0.6810598626104023, "grad_norm": 16167.87021072455, "learning_rate": 1.3960389804332556e-07, "logits/chosen": -5.63193416595459, "logits/rejected": -5.540692329406738, "logps/chosen": -5405.37451171875, "logps/rejected": -8511.1318359375, "loss": -48.8991, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -5148.23193359375, "rewards/margins": 3071.04541015625, "rewards/rejected": -8219.2763671875, "step": 3470 }, { "epoch": 0.6830225711481845, "grad_norm": 12257.326400363696, "learning_rate": 1.380695882960165e-07, "logits/chosen": -5.7272629737854, "logits/rejected": -5.8701629638671875, "logps/chosen": -6587.0107421875, "logps/rejected": -11632.650390625, "loss": -52.9292, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -6314.02587890625, "rewards/margins": 5091.6328125, "rewards/rejected": -11405.6591796875, "step": 3480 }, { "epoch": 0.6849852796859667, "grad_norm": 43060.15519451069, "learning_rate": 1.3654053348514702e-07, "logits/chosen": -5.689966201782227, "logits/rejected": -5.83602237701416, "logps/chosen": -4095.103515625, "logps/rejected": -11215.408203125, "loss": -36.6181, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3930.578857421875, "rewards/margins": 7083.15771484375, "rewards/rejected": -11013.7373046875, "step": 3490 }, { "epoch": 0.6869479882237488, "grad_norm": 22223.10094294949, "learning_rate": 1.350168053971577e-07, "logits/chosen": -6.1233649253845215, "logits/rejected": -5.788354873657227, "logps/chosen": -9106.62890625, "logps/rejected": -10510.11328125, "loss": -52.5913, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -8744.1220703125, "rewards/margins": 1526.274658203125, "rewards/rejected": -10270.396484375, "step": 3500 }, { "epoch": 0.6889106967615309, "grad_norm": 9260.91193983033, "learning_rate": 1.3349847556840876e-07, "logits/chosen": -6.035849094390869, "logits/rejected": -5.760616779327393, "logps/chosen": -9830.3125, "logps/rejected": -14624.6044921875, "loss": -49.607, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -9584.955078125, "rewards/margins": 4772.97607421875, "rewards/rejected": -14357.9326171875, "step": 3510 }, { "epoch": 0.6908734052993131, "grad_norm": 24105.276258028993, "learning_rate": 1.3198561528182182e-07, "logits/chosen": -5.777835369110107, "logits/rejected": -5.749192714691162, "logps/chosen": -9050.3603515625, "logps/rejected": -14486.859375, "loss": -51.4045, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -8851.3125, "rewards/margins": 5411.87109375, "rewards/rejected": -14263.1845703125, "step": 3520 }, { "epoch": 0.6928361138370952, "grad_norm": 25794.658171444513, "learning_rate": 1.3047829556353263e-07, "logits/chosen": -5.651608467102051, "logits/rejected": -5.828283309936523, "logps/chosen": -5177.1748046875, "logps/rejected": -10207.8232421875, "loss": -26.6374, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -4924.34814453125, "rewards/margins": 5043.4482421875, "rewards/rejected": -9967.7958984375, "step": 3530 }, { "epoch": 0.6947988223748773, "grad_norm": 12011.69849945466, "learning_rate": 1.2897658717955742e-07, "logits/chosen": -5.7347564697265625, "logits/rejected": -5.7237043380737305, "logps/chosen": -5554.14794921875, "logps/rejected": -10123.296875, "loss": -31.3124, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -5291.82080078125, "rewards/margins": 4617.94384765625, "rewards/rejected": -9909.7666015625, "step": 3540 }, { "epoch": 0.6967615309126595, "grad_norm": 14446.751906990605, "learning_rate": 1.2748056063246994e-07, "logits/chosen": -5.635600566864014, "logits/rejected": -5.718235969543457, "logps/chosen": -6931.3291015625, "logps/rejected": -10474.2275390625, "loss": -32.8063, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -6629.62109375, "rewards/margins": 3555.21728515625, "rewards/rejected": -10184.837890625, "step": 3550 }, { "epoch": 0.6987242394504416, "grad_norm": 19397.57692796766, "learning_rate": 1.2599028615809183e-07, "logits/chosen": -5.620645046234131, "logits/rejected": -6.010846138000488, "logps/chosen": -5239.66748046875, "logps/rejected": -13830.7861328125, "loss": -52.4162, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -4942.888671875, "rewards/margins": 8649.3173828125, "rewards/rejected": -13592.2060546875, "step": 3560 }, { "epoch": 0.7006869479882237, "grad_norm": 12892.601424103728, "learning_rate": 1.2450583372219458e-07, "logits/chosen": -5.7722625732421875, "logits/rejected": -6.254805564880371, "logps/chosen": -5447.4736328125, "logps/rejected": -16814.921875, "loss": -65.9521, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -5144.95849609375, "rewards/margins": 11374.75, "rewards/rejected": -16519.708984375, "step": 3570 }, { "epoch": 0.7026496565260059, "grad_norm": 17419.92706564986, "learning_rate": 1.230272730172157e-07, "logits/chosen": -5.847105979919434, "logits/rejected": -5.951395034790039, "logps/chosen": -7758.01953125, "logps/rejected": -11443.423828125, "loss": -22.3084, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -7461.6025390625, "rewards/margins": 3674.16748046875, "rewards/rejected": -11135.7705078125, "step": 3580 }, { "epoch": 0.704612365063788, "grad_norm": 7640.052250826008, "learning_rate": 1.2155467345898602e-07, "logits/chosen": -5.884621620178223, "logits/rejected": -6.429002285003662, "logps/chosen": -11281.388671875, "logps/rejected": -14346.2421875, "loss": -55.8164, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -11013.9287109375, "rewards/margins": 3018.797119140625, "rewards/rejected": -14032.728515625, "step": 3590 }, { "epoch": 0.7065750736015701, "grad_norm": 40020.578810813546, "learning_rate": 1.2008810418347093e-07, "logits/chosen": -6.366058826446533, "logits/rejected": -6.1777167320251465, "logps/chosen": -6046.31396484375, "logps/rejected": -12688.7060546875, "loss": -68.5755, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -5845.25537109375, "rewards/margins": 6631.65234375, "rewards/rejected": -12476.908203125, "step": 3600 }, { "epoch": 0.7085377821393523, "grad_norm": 19537.477788360422, "learning_rate": 1.1862763404352483e-07, "logits/chosen": -5.961631774902344, "logits/rejected": -6.1319499015808105, "logps/chosen": -14600.4951171875, "logps/rejected": -17471.859375, "loss": -42.7523, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -14286.4189453125, "rewards/margins": 2906.14013671875, "rewards/rejected": -17192.55859375, "step": 3610 }, { "epoch": 0.7105004906771345, "grad_norm": 21492.530907713943, "learning_rate": 1.1717333160565807e-07, "logits/chosen": -5.414709091186523, "logits/rejected": -6.407537937164307, "logps/chosen": -8014.40771484375, "logps/rejected": -18653.47265625, "loss": -70.7611, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -7656.70068359375, "rewards/margins": 10714.4248046875, "rewards/rejected": -18371.125, "step": 3620 }, { "epoch": 0.7124631992149166, "grad_norm": 39133.684475217204, "learning_rate": 1.1572526514681874e-07, "logits/chosen": -6.152876853942871, "logits/rejected": -5.5300469398498535, "logps/chosen": -9903.3779296875, "logps/rejected": -6850.9755859375, "loss": -2.9947, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": -9605.357421875, "rewards/margins": -3082.454345703125, "rewards/rejected": -6522.90380859375, "step": 3630 }, { "epoch": 0.7144259077526988, "grad_norm": 34059.677157498234, "learning_rate": 1.1428350265118613e-07, "logits/chosen": -5.830441474914551, "logits/rejected": -5.820923805236816, "logps/chosen": -11372.6259765625, "logps/rejected": -14096.7373046875, "loss": -41.9517, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -11055.4462890625, "rewards/margins": 2764.27099609375, "rewards/rejected": -13819.71875, "step": 3640 }, { "epoch": 0.7163886162904809, "grad_norm": 27068.50282817486, "learning_rate": 1.128481118069799e-07, "logits/chosen": -5.646975517272949, "logits/rejected": -5.4839653968811035, "logps/chosen": -8316.890625, "logps/rejected": -14379.205078125, "loss": -56.0848, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -8045.86083984375, "rewards/margins": 6058.0888671875, "rewards/rejected": -14103.9501953125, "step": 3650 }, { "epoch": 0.718351324828263, "grad_norm": 32940.2763329967, "learning_rate": 1.114191600032815e-07, "logits/chosen": -6.053099632263184, "logits/rejected": -6.327016830444336, "logps/chosen": -10874.2275390625, "logps/rejected": -14570.7607421875, "loss": -53.2441, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -10562.61328125, "rewards/margins": 3734.92724609375, "rewards/rejected": -14297.541015625, "step": 3660 }, { "epoch": 0.7203140333660452, "grad_norm": 16739.651820881187, "learning_rate": 1.0999671432687099e-07, "logits/chosen": -5.234135150909424, "logits/rejected": -5.513120174407959, "logps/chosen": -6768.1259765625, "logps/rejected": -10713.5712890625, "loss": -50.3025, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -6469.193359375, "rewards/margins": 4007.73779296875, "rewards/rejected": -10476.9296875, "step": 3670 }, { "epoch": 0.7222767419038273, "grad_norm": 22299.55027796075, "learning_rate": 1.085808415590772e-07, "logits/chosen": -5.438049793243408, "logits/rejected": -6.078253746032715, "logps/chosen": -7943.3818359375, "logps/rejected": -15121.7783203125, "loss": -46.3061, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -7657.62353515625, "rewards/margins": 7216.3935546875, "rewards/rejected": -14874.017578125, "step": 3680 }, { "epoch": 0.7242394504416094, "grad_norm": 5392.217774942793, "learning_rate": 1.0717160817264217e-07, "logits/chosen": -5.76591682434082, "logits/rejected": -5.573340892791748, "logps/chosen": -8868.09375, "logps/rejected": -9070.2822265625, "loss": -45.8592, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -8595.615234375, "rewards/margins": 206.05654907226562, "rewards/rejected": -8801.673828125, "step": 3690 }, { "epoch": 0.7262021589793916, "grad_norm": 29470.05892956533, "learning_rate": 1.0576908032860088e-07, "logits/chosen": -5.662121295928955, "logits/rejected": -6.009718894958496, "logps/chosen": -7047.3759765625, "logps/rejected": -10066.947265625, "loss": -38.0695, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": -6790.86865234375, "rewards/margins": 3069.06494140625, "rewards/rejected": -9859.9345703125, "step": 3700 }, { "epoch": 0.7281648675171737, "grad_norm": 6878.745510680303, "learning_rate": 1.0437332387317474e-07, "logits/chosen": -5.622027397155762, "logits/rejected": -5.5018415451049805, "logps/chosen": -7321.5439453125, "logps/rejected": -8870.005859375, "loss": -23.6314, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7076.109375, "rewards/margins": 1581.976806640625, "rewards/rejected": -8658.0859375, "step": 3710 }, { "epoch": 0.7301275760549558, "grad_norm": 50131.16809863452, "learning_rate": 1.0298440433468048e-07, "logits/chosen": -5.270399570465088, "logits/rejected": -5.671762943267822, "logps/chosen": -6482.0595703125, "logps/rejected": -10487.4365234375, "loss": -23.5386, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6144.5380859375, "rewards/margins": 4090.76025390625, "rewards/rejected": -10235.296875, "step": 3720 }, { "epoch": 0.732090284592738, "grad_norm": 43547.184044162415, "learning_rate": 1.0160238692045331e-07, "logits/chosen": -5.1884074211120605, "logits/rejected": -5.451391696929932, "logps/chosen": -3623.833984375, "logps/rejected": -6053.36474609375, "loss": -37.4796, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -3372.44580078125, "rewards/margins": 2472.100830078125, "rewards/rejected": -5844.54638671875, "step": 3730 }, { "epoch": 0.7340529931305201, "grad_norm": 13553.76351777675, "learning_rate": 1.0022733651378606e-07, "logits/chosen": -5.392740726470947, "logits/rejected": -5.570335388183594, "logps/chosen": -5804.17529296875, "logps/rejected": -11417.943359375, "loss": -38.1214, "rewards/accuracies": 0.7333332300186157, "rewards/chosen": -5431.228515625, "rewards/margins": 5716.70947265625, "rewards/rejected": -11147.9375, "step": 3740 }, { "epoch": 0.7360157016683022, "grad_norm": 20604.65999225014, "learning_rate": 9.88593176708827e-08, "logits/chosen": -5.883467197418213, "logits/rejected": -5.551784038543701, "logps/chosen": -4774.76025390625, "logps/rejected": -7942.04833984375, "loss": -33.6705, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -4499.48193359375, "rewards/margins": 3156.81591796875, "rewards/rejected": -7656.29833984375, "step": 3750 }, { "epoch": 0.7379784102060843, "grad_norm": 29751.60443625463, "learning_rate": 9.749839461782769e-08, "logits/chosen": -5.326167106628418, "logits/rejected": -5.301980018615723, "logps/chosen": -2632.874755859375, "logps/rejected": -11264.5634765625, "loss": -59.9892, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -2383.447998046875, "rewards/margins": 8588.8232421875, "rewards/rejected": -10972.271484375, "step": 3760 }, { "epoch": 0.7399411187438666, "grad_norm": 15512.719642761493, "learning_rate": 9.614463124757041e-08, "logits/chosen": -5.4546098709106445, "logits/rejected": -6.272017478942871, "logps/chosen": -3299.74267578125, "logps/rejected": -10026.826171875, "loss": -29.0896, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3063.593994140625, "rewards/margins": 6730.1962890625, "rewards/rejected": -9793.7890625, "step": 3770 }, { "epoch": 0.7419038272816487, "grad_norm": 48009.6276310012, "learning_rate": 9.479809111692586e-08, "logits/chosen": -5.556764602661133, "logits/rejected": -5.344692707061768, "logps/chosen": -8063.296875, "logps/rejected": -7475.7392578125, "loss": -31.9567, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": -7828.13916015625, "rewards/margins": -633.9149169921875, "rewards/rejected": -7194.22412109375, "step": 3780 }, { "epoch": 0.7438665358194309, "grad_norm": 17403.65035482521, "learning_rate": 9.345883744359065e-08, "logits/chosen": -5.4752197265625, "logits/rejected": -5.5923075675964355, "logps/chosen": -8765.080078125, "logps/rejected": -11439.1669921875, "loss": -32.033, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -8495.0302734375, "rewards/margins": 2608.023681640625, "rewards/rejected": -11103.052734375, "step": 3790 }, { "epoch": 0.745829244357213, "grad_norm": 32226.24280670446, "learning_rate": 9.212693310317479e-08, "logits/chosen": -5.248085975646973, "logits/rejected": -5.685252666473389, "logps/chosen": -5042.3291015625, "logps/rejected": -10704.4140625, "loss": -38.8539, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -4788.63037109375, "rewards/margins": 5673.7236328125, "rewards/rejected": -10462.3525390625, "step": 3800 }, { "epoch": 0.7477919528949951, "grad_norm": 21818.38154378926, "learning_rate": 9.08024406262503e-08, "logits/chosen": -5.292876243591309, "logits/rejected": -5.627497673034668, "logps/chosen": -5077.37548828125, "logps/rejected": -10055.001953125, "loss": -42.1256, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -4832.48974609375, "rewards/margins": 4972.6845703125, "rewards/rejected": -9805.173828125, "step": 3810 }, { "epoch": 0.7497546614327772, "grad_norm": 20995.213410815093, "learning_rate": 8.94854221954148e-08, "logits/chosen": -5.9033379554748535, "logits/rejected": -5.630743980407715, "logps/chosen": -6463.76171875, "logps/rejected": -5801.787109375, "loss": -56.3927, "rewards/accuracies": 0.5, "rewards/chosen": -6230.32861328125, "rewards/margins": -633.3435668945312, "rewards/rejected": -5596.9853515625, "step": 3820 }, { "epoch": 0.7517173699705594, "grad_norm": 20001.863179400112, "learning_rate": 8.817593964237316e-08, "logits/chosen": -5.720893383026123, "logits/rejected": -5.971555233001709, "logps/chosen": -9240.984375, "logps/rejected": -14776.212890625, "loss": -66.9254, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8961.67578125, "rewards/margins": 5570.7958984375, "rewards/rejected": -14532.4736328125, "step": 3830 }, { "epoch": 0.7536800785083415, "grad_norm": 30932.470865444026, "learning_rate": 8.68740544450334e-08, "logits/chosen": -5.658432960510254, "logits/rejected": -5.420456886291504, "logps/chosen": -9087.0361328125, "logps/rejected": -9325.5185546875, "loss": -20.1876, "rewards/accuracies": 0.6333333849906921, "rewards/chosen": -8726.701171875, "rewards/margins": 330.46563720703125, "rewards/rejected": -9057.1669921875, "step": 3840 }, { "epoch": 0.7556427870461236, "grad_norm": 14018.49459178659, "learning_rate": 8.557982772462138e-08, "logits/chosen": -5.9767022132873535, "logits/rejected": -5.852444648742676, "logps/chosen": -7218.5703125, "logps/rejected": -14626.060546875, "loss": -53.6391, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6955.6845703125, "rewards/margins": 7419.10791015625, "rewards/rejected": -14374.7939453125, "step": 3850 }, { "epoch": 0.7576054955839058, "grad_norm": 19225.844294170587, "learning_rate": 8.429332024281088e-08, "logits/chosen": -5.76236629486084, "logits/rejected": -5.509146690368652, "logps/chosen": -6295.4033203125, "logps/rejected": -8247.8720703125, "loss": -39.7431, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6009.4970703125, "rewards/margins": 2021.494873046875, "rewards/rejected": -8030.9912109375, "step": 3860 }, { "epoch": 0.7595682041216879, "grad_norm": 20327.81019167429, "learning_rate": 8.301459239887073e-08, "logits/chosen": -5.667635917663574, "logits/rejected": -5.806612968444824, "logps/chosen": -9341.2939453125, "logps/rejected": -14411.896484375, "loss": -43.2259, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -9001.431640625, "rewards/margins": 5144.4443359375, "rewards/rejected": -14145.876953125, "step": 3870 }, { "epoch": 0.76153091265947, "grad_norm": 25699.683290534067, "learning_rate": 8.17437042268298e-08, "logits/chosen": -5.749732494354248, "logits/rejected": -5.4239702224731445, "logps/chosen": -12584.0419921875, "logps/rejected": -12023.9599609375, "loss": -24.3332, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -12295.3232421875, "rewards/margins": -577.3702392578125, "rewards/rejected": -11717.955078125, "step": 3880 }, { "epoch": 0.7634936211972522, "grad_norm": 38257.34614190427, "learning_rate": 8.048071539265761e-08, "logits/chosen": -5.883009910583496, "logits/rejected": -5.78053617477417, "logps/chosen": -13877.400390625, "logps/rejected": -11593.130859375, "loss": -35.3128, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": -13562.9453125, "rewards/margins": -2201.19921875, "rewards/rejected": -11361.74609375, "step": 3890 }, { "epoch": 0.7654563297350343, "grad_norm": 13177.316357458278, "learning_rate": 7.922568519146425e-08, "logits/chosen": -5.8899431228637695, "logits/rejected": -5.654213905334473, "logps/chosen": -6354.080078125, "logps/rejected": -13820.048828125, "loss": -36.8126, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": -6127.3232421875, "rewards/margins": 7487.5830078125, "rewards/rejected": -13614.904296875, "step": 3900 }, { "epoch": 0.7674190382728164, "grad_norm": 17753.572433480378, "learning_rate": 7.79786725447154e-08, "logits/chosen": -5.511194229125977, "logits/rejected": -5.529297828674316, "logps/chosen": -6008.818359375, "logps/rejected": -11394.099609375, "loss": -49.3839, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -5748.92138671875, "rewards/margins": 5429.52197265625, "rewards/rejected": -11178.443359375, "step": 3910 }, { "epoch": 0.7693817468105987, "grad_norm": 26223.176320372742, "learning_rate": 7.6739735997467e-08, "logits/chosen": -5.352577209472656, "logits/rejected": -5.448346138000488, "logps/chosen": -6656.86474609375, "logps/rejected": -12605.685546875, "loss": -35.1395, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -6359.06591796875, "rewards/margins": 5986.4375, "rewards/rejected": -12345.5048828125, "step": 3920 }, { "epoch": 0.7713444553483808, "grad_norm": 26356.39572511418, "learning_rate": 7.550893371561593e-08, "logits/chosen": -5.414185523986816, "logits/rejected": -5.726749897003174, "logps/chosen": -5058.373046875, "logps/rejected": -9720.19140625, "loss": -57.0665, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -4790.02685546875, "rewards/margins": 4685.4267578125, "rewards/rejected": -9475.4541015625, "step": 3930 }, { "epoch": 0.7733071638861629, "grad_norm": 21875.90085054338, "learning_rate": 7.428632348317004e-08, "logits/chosen": -5.67410135269165, "logits/rejected": -6.12750768661499, "logps/chosen": -7117.21875, "logps/rejected": -12323.5205078125, "loss": -42.672, "rewards/accuracies": 0.5666666030883789, "rewards/chosen": -6868.81103515625, "rewards/margins": 5196.22900390625, "rewards/rejected": -12065.0390625, "step": 3940 }, { "epoch": 0.7752698724239451, "grad_norm": 21595.679278865802, "learning_rate": 7.307196269953444e-08, "logits/chosen": -5.398061275482178, "logits/rejected": -5.71030330657959, "logps/chosen": -7759.0908203125, "logps/rejected": -13286.1748046875, "loss": -30.0773, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7475.99462890625, "rewards/margins": 5547.39794921875, "rewards/rejected": -13023.392578125, "step": 3950 }, { "epoch": 0.7772325809617272, "grad_norm": 23694.48259135807, "learning_rate": 7.186590837681732e-08, "logits/chosen": -5.498406410217285, "logits/rejected": -5.7211127281188965, "logps/chosen": -7438.23046875, "logps/rejected": -11851.1318359375, "loss": -38.2046, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -7175.3916015625, "rewards/margins": 4468.5146484375, "rewards/rejected": -11643.90625, "step": 3960 }, { "epoch": 0.7791952894995093, "grad_norm": 91048.6017802154, "learning_rate": 7.066821713715293e-08, "logits/chosen": -5.3139238357543945, "logits/rejected": -5.605257987976074, "logps/chosen": -6156.630859375, "logps/rejected": -13442.537109375, "loss": -55.816, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -5841.7421875, "rewards/margins": 7314.96484375, "rewards/rejected": -13156.7080078125, "step": 3970 }, { "epoch": 0.7811579980372915, "grad_norm": 21043.652022561655, "learning_rate": 6.947894521004357e-08, "logits/chosen": -5.202340602874756, "logits/rejected": -5.405139923095703, "logps/chosen": -4332.08251953125, "logps/rejected": -10243.779296875, "loss": -54.5117, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -4050.005859375, "rewards/margins": 5901.56591796875, "rewards/rejected": -9951.572265625, "step": 3980 }, { "epoch": 0.7831207065750736, "grad_norm": 35069.3728078974, "learning_rate": 6.829814842971965e-08, "logits/chosen": -5.564299583435059, "logits/rejected": -5.327577114105225, "logps/chosen": -6377.2529296875, "logps/rejected": -9955.759765625, "loss": -16.8415, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -6151.12744140625, "rewards/margins": 3556.9140625, "rewards/rejected": -9708.0419921875, "step": 3990 }, { "epoch": 0.7850834151128557, "grad_norm": 33812.716870350145, "learning_rate": 6.712588223251809e-08, "logits/chosen": -5.008543491363525, "logits/rejected": -5.671880722045898, "logps/chosen": -7289.59375, "logps/rejected": -16076.5654296875, "loss": -58.2168, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -6948.9599609375, "rewards/margins": 8847.759765625, "rewards/rejected": -15796.7216796875, "step": 4000 }, { "epoch": 0.7870461236506379, "grad_norm": 46627.702489882045, "learning_rate": 6.596220165428002e-08, "logits/chosen": -5.599554538726807, "logits/rejected": -5.746572494506836, "logps/chosen": -9348.205078125, "logps/rejected": -12383.8818359375, "loss": -38.2762, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -9099.185546875, "rewards/margins": 3036.583984375, "rewards/rejected": -12135.7705078125, "step": 4010 }, { "epoch": 0.78900883218842, "grad_norm": 15365.882490024022, "learning_rate": 6.48071613277669e-08, "logits/chosen": -5.43951940536499, "logits/rejected": -5.7966766357421875, "logps/chosen": -9795.9091796875, "logps/rejected": -14209.3544921875, "loss": -46.0584, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -9555.9130859375, "rewards/margins": 4385.2958984375, "rewards/rejected": -13941.208984375, "step": 4020 }, { "epoch": 0.7909715407262021, "grad_norm": 19634.368213662914, "learning_rate": 6.366081548009553e-08, "logits/chosen": -5.461313724517822, "logits/rejected": -5.752083778381348, "logps/chosen": -8086.28369140625, "logps/rejected": -11382.6728515625, "loss": -49.1907, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": -7819.33984375, "rewards/margins": 3302.328857421875, "rewards/rejected": -11121.6689453125, "step": 4030 }, { "epoch": 0.7929342492639843, "grad_norm": 23650.924955487062, "learning_rate": 6.252321793019192e-08, "logits/chosen": -5.650764465332031, "logits/rejected": -5.564217567443848, "logps/chosen": -7799.43212890625, "logps/rejected": -11430.5380859375, "loss": -42.4354, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -7567.69677734375, "rewards/margins": 3615.09912109375, "rewards/rejected": -11182.794921875, "step": 4040 }, { "epoch": 0.7948969578017664, "grad_norm": 13504.018410762326, "learning_rate": 6.139442208626517e-08, "logits/chosen": -5.818031311035156, "logits/rejected": -6.002923965454102, "logps/chosen": -5654.96728515625, "logps/rejected": -9241.576171875, "loss": -42.2606, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": -5463.09521484375, "rewards/margins": 3618.589111328125, "rewards/rejected": -9081.685546875, "step": 4050 }, { "epoch": 0.7968596663395485, "grad_norm": 17465.40602729159, "learning_rate": 6.027448094329963e-08, "logits/chosen": -5.401460647583008, "logits/rejected": -5.435824871063232, "logps/chosen": -5476.2783203125, "logps/rejected": -10484.3857421875, "loss": -73.0351, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -5233.18994140625, "rewards/margins": 4978.724609375, "rewards/rejected": -10211.9150390625, "step": 4060 }, { "epoch": 0.7988223748773308, "grad_norm": 18229.959852643402, "learning_rate": 5.916344708056681e-08, "logits/chosen": -5.743640422821045, "logits/rejected": -5.654270648956299, "logps/chosen": -8351.8212890625, "logps/rejected": -10197.4482421875, "loss": -51.9345, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -8082.64306640625, "rewards/margins": 1882.427734375, "rewards/rejected": -9965.0703125, "step": 4070 }, { "epoch": 0.8007850834151129, "grad_norm": 11080.095394396749, "learning_rate": 5.8061372659157306e-08, "logits/chosen": -5.764735698699951, "logits/rejected": -5.966015338897705, "logps/chosen": -9404.951171875, "logps/rejected": -17033.86328125, "loss": -48.6827, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -9054.53515625, "rewards/margins": 7650.359375, "rewards/rejected": -16704.89453125, "step": 4080 }, { "epoch": 0.802747791952895, "grad_norm": 22329.14392123284, "learning_rate": 5.6968309419531376e-08, "logits/chosen": -5.461724281311035, "logits/rejected": -5.526611328125, "logps/chosen": -8488.25390625, "logps/rejected": -13745.1279296875, "loss": -63.2434, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8199.080078125, "rewards/margins": 5294.1240234375, "rewards/rejected": -13493.205078125, "step": 4090 }, { "epoch": 0.8047105004906772, "grad_norm": 35195.10011213242, "learning_rate": 5.5884308679090525e-08, "logits/chosen": -5.658880233764648, "logits/rejected": -5.6876373291015625, "logps/chosen": -7975.4345703125, "logps/rejected": -7219.63427734375, "loss": -53.1387, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -7728.3525390625, "rewards/margins": -692.8310546875, "rewards/rejected": -7035.5205078125, "step": 4100 }, { "epoch": 0.8066732090284593, "grad_norm": 50535.91029368612, "learning_rate": 5.480942132976732e-08, "logits/chosen": -5.258803367614746, "logits/rejected": -5.646577835083008, "logps/chosen": -5729.68212890625, "logps/rejected": -11648.3525390625, "loss": -51.0113, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -5408.0751953125, "rewards/margins": 6046.53759765625, "rewards/rejected": -11454.61328125, "step": 4110 }, { "epoch": 0.8086359175662414, "grad_norm": 18546.772931990945, "learning_rate": 5.374369783563698e-08, "logits/chosen": -5.300049781799316, "logits/rejected": -5.793374538421631, "logps/chosen": -7394.32568359375, "logps/rejected": -14648.1748046875, "loss": -62.8568, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -7115.16943359375, "rewards/margins": 7253.67041015625, "rewards/rejected": -14368.8408203125, "step": 4120 }, { "epoch": 0.8105986261040236, "grad_norm": 53338.81991787677, "learning_rate": 5.268718823054752e-08, "logits/chosen": -6.142602920532227, "logits/rejected": -5.655788421630859, "logps/chosen": -11563.791015625, "logps/rejected": -10590.697265625, "loss": -62.9226, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": -11317.578125, "rewards/margins": -974.4451904296875, "rewards/rejected": -10343.1328125, "step": 4130 }, { "epoch": 0.8125613346418057, "grad_norm": 30376.20776204416, "learning_rate": 5.1639942115771384e-08, "logits/chosen": -4.920853614807129, "logits/rejected": -5.563089370727539, "logps/chosen": -7852.92041015625, "logps/rejected": -13996.470703125, "loss": -35.0299, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -7619.9677734375, "rewards/margins": 6159.09423828125, "rewards/rejected": -13779.0625, "step": 4140 }, { "epoch": 0.8145240431795878, "grad_norm": 21224.075937372243, "learning_rate": 5.060200865767605e-08, "logits/chosen": -5.440930366516113, "logits/rejected": -5.209008693695068, "logps/chosen": -9694.376953125, "logps/rejected": -14167.064453125, "loss": -52.3943, "rewards/accuracies": 0.6333333849906921, "rewards/chosen": -9320.7099609375, "rewards/margins": 4542.91796875, "rewards/rejected": -13863.6279296875, "step": 4150 }, { "epoch": 0.81648675171737, "grad_norm": 17743.511187540764, "learning_rate": 4.957343658541632e-08, "logits/chosen": -5.6792802810668945, "logits/rejected": -5.494027137756348, "logps/chosen": -5562.3818359375, "logps/rejected": -14443.880859375, "loss": -49.3637, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -5331.1142578125, "rewards/margins": 8843.1171875, "rewards/rejected": -14174.232421875, "step": 4160 }, { "epoch": 0.8184494602551521, "grad_norm": 24905.395574477814, "learning_rate": 4.8554274188646215e-08, "logits/chosen": -5.439173221588135, "logits/rejected": -5.015594482421875, "logps/chosen": -11793.6123046875, "logps/rejected": -11658.455078125, "loss": -39.0295, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": -11533.4423828125, "rewards/margins": -85.32148742675781, "rewards/rejected": -11448.12109375, "step": 4170 }, { "epoch": 0.8204121687929342, "grad_norm": 35286.99636356708, "learning_rate": 4.754456931525208e-08, "logits/chosen": -5.985587120056152, "logits/rejected": -6.0778985023498535, "logps/chosen": -8265.693359375, "logps/rejected": -17402.28515625, "loss": -79.9183, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -7998.1328125, "rewards/margins": 9134.271484375, "rewards/rejected": -17132.40234375, "step": 4180 }, { "epoch": 0.8223748773307163, "grad_norm": 13047.68543526814, "learning_rate": 4.654436936910622e-08, "logits/chosen": -5.178874969482422, "logits/rejected": -5.954102993011475, "logps/chosen": -9648.169921875, "logps/rejected": -17150.30859375, "loss": -40.4459, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -9343.783203125, "rewards/margins": 7561.6513671875, "rewards/rejected": -16905.43359375, "step": 4190 }, { "epoch": 0.8243375858684985, "grad_norm": 38211.575162422516, "learning_rate": 4.555372130784102e-08, "logits/chosen": -5.521913051605225, "logits/rejected": -5.499879837036133, "logps/chosen": -6927.7392578125, "logps/rejected": -12410.0869140625, "loss": -38.8652, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6547.15185546875, "rewards/margins": 5563.42919921875, "rewards/rejected": -12110.5810546875, "step": 4200 }, { "epoch": 0.8263002944062807, "grad_norm": 28915.449364668308, "learning_rate": 4.45726716406449e-08, "logits/chosen": -5.583122253417969, "logits/rejected": -5.537442684173584, "logps/chosen": -10077.97265625, "logps/rejected": -13339.6171875, "loss": -23.8868, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -9768.9853515625, "rewards/margins": 3312.250732421875, "rewards/rejected": -13081.2353515625, "step": 4210 }, { "epoch": 0.8282630029440629, "grad_norm": 15516.420871026046, "learning_rate": 4.360126642607842e-08, "logits/chosen": -5.351869106292725, "logits/rejected": -5.577881336212158, "logps/chosen": -6680.63427734375, "logps/rejected": -8888.21875, "loss": -63.1694, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -6350.23046875, "rewards/margins": 2300.090087890625, "rewards/rejected": -8650.3193359375, "step": 4220 }, { "epoch": 0.830225711481845, "grad_norm": 26374.982232214992, "learning_rate": 4.2639551269912034e-08, "logits/chosen": -5.3291778564453125, "logits/rejected": -5.991674423217773, "logps/chosen": -4948.32568359375, "logps/rejected": -9210.630859375, "loss": -46.0437, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -4739.35205078125, "rewards/margins": 4280.1650390625, "rewards/rejected": -9019.517578125, "step": 4230 }, { "epoch": 0.8321884200196271, "grad_norm": 17675.33247824001, "learning_rate": 4.168757132298478e-08, "logits/chosen": -5.882918834686279, "logits/rejected": -5.621251106262207, "logps/chosen": -10704.8349609375, "logps/rejected": -11739.685546875, "loss": -10.4489, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -10446.00390625, "rewards/margins": 1017.0921630859375, "rewards/rejected": -11463.0966796875, "step": 4240 }, { "epoch": 0.8341511285574092, "grad_norm": 21941.278527406088, "learning_rate": 4.0745371279084976e-08, "logits/chosen": -5.577470779418945, "logits/rejected": -5.808836460113525, "logps/chosen": -6161.45654296875, "logps/rejected": -12138.236328125, "loss": -71.8074, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5913.11279296875, "rewards/margins": 6000.79833984375, "rewards/rejected": -11913.91015625, "step": 4250 }, { "epoch": 0.8361138370951914, "grad_norm": 29403.85761590277, "learning_rate": 3.9812995372851544e-08, "logits/chosen": -5.705291271209717, "logits/rejected": -5.619935512542725, "logps/chosen": -7623.78662109375, "logps/rejected": -8382.2177734375, "loss": -31.5073, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -7376.5966796875, "rewards/margins": 781.0252685546875, "rewards/rejected": -8157.6220703125, "step": 4260 }, { "epoch": 0.8380765456329735, "grad_norm": 22638.75468780541, "learning_rate": 3.8890487377697265e-08, "logits/chosen": -5.617705345153809, "logits/rejected": -5.677297592163086, "logps/chosen": -6681.42724609375, "logps/rejected": -12282.83203125, "loss": -43.2563, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -6427.31396484375, "rewards/margins": 5617.63818359375, "rewards/rejected": -12044.9521484375, "step": 4270 }, { "epoch": 0.8400392541707556, "grad_norm": 39882.53821735189, "learning_rate": 3.7977890603754e-08, "logits/chosen": -5.163455009460449, "logits/rejected": -5.4479570388793945, "logps/chosen": -5549.169921875, "logps/rejected": -13040.1474609375, "loss": -61.6358, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -5200.162109375, "rewards/margins": 7526.76953125, "rewards/rejected": -12726.931640625, "step": 4280 }, { "epoch": 0.8420019627085378, "grad_norm": 32986.66432472335, "learning_rate": 3.707524789583891e-08, "logits/chosen": -5.5402140617370605, "logits/rejected": -5.859433174133301, "logps/chosen": -9826.212890625, "logps/rejected": -14118.294921875, "loss": -47.5729, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -9512.4052734375, "rewards/margins": 4297.0986328125, "rewards/rejected": -13809.5029296875, "step": 4290 }, { "epoch": 0.8439646712463199, "grad_norm": 16171.264461728495, "learning_rate": 3.6182601631443596e-08, "logits/chosen": -5.416332244873047, "logits/rejected": -5.442586421966553, "logps/chosen": -6375.8056640625, "logps/rejected": -12668.638671875, "loss": -62.6272, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6052.4580078125, "rewards/margins": 6348.6259765625, "rewards/rejected": -12401.0830078125, "step": 4300 }, { "epoch": 0.845927379784102, "grad_norm": 90461.67221101571, "learning_rate": 3.529999371874381e-08, "logits/chosen": -6.126959800720215, "logits/rejected": -5.4796142578125, "logps/chosen": -9434.0078125, "logps/rejected": -12682.3076171875, "loss": -26.9017, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -9152.47265625, "rewards/margins": 3267.29541015625, "rewards/rejected": -12419.7685546875, "step": 4310 }, { "epoch": 0.8478900883218842, "grad_norm": 60279.06437726382, "learning_rate": 3.4427465594632555e-08, "logits/chosen": -5.6222615242004395, "logits/rejected": -5.9407148361206055, "logps/chosen": -8406.7080078125, "logps/rejected": -8478.1884765625, "loss": -55.0376, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -8205.9912109375, "rewards/margins": 94.17915344238281, "rewards/rejected": -8300.1708984375, "step": 4320 }, { "epoch": 0.8498527968596663, "grad_norm": 38049.45324398646, "learning_rate": 3.356505822277417e-08, "logits/chosen": -5.746432304382324, "logits/rejected": -5.521189212799072, "logps/chosen": -8715.3173828125, "logps/rejected": -10882.662109375, "loss": -44.3867, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8445.228515625, "rewards/margins": 2174.84375, "rewards/rejected": -10620.072265625, "step": 4330 }, { "epoch": 0.8518155053974484, "grad_norm": 28961.585066629013, "learning_rate": 3.271281209168186e-08, "logits/chosen": -5.579524993896484, "logits/rejected": -5.348021507263184, "logps/chosen": -11741.3349609375, "logps/rejected": -11505.5380859375, "loss": -21.501, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -11464.99609375, "rewards/margins": -175.85415649414062, "rewards/rejected": -11289.142578125, "step": 4340 }, { "epoch": 0.8537782139352306, "grad_norm": 37153.51804323864, "learning_rate": 3.187076721281595e-08, "logits/chosen": -5.76254415512085, "logits/rejected": -5.386960506439209, "logps/chosen": -7618.7861328125, "logps/rejected": -6885.86474609375, "loss": -26.7615, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -7373.1318359375, "rewards/margins": -726.4737548828125, "rewards/rejected": -6646.65869140625, "step": 4350 }, { "epoch": 0.8557409224730128, "grad_norm": 24537.166947137863, "learning_rate": 3.1038963118706244e-08, "logits/chosen": -5.747314929962158, "logits/rejected": -5.723934173583984, "logps/chosen": -9406.173828125, "logps/rejected": -16458.5, "loss": -68.8861, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -9128.783203125, "rewards/margins": 7097.1103515625, "rewards/rejected": -16225.8935546875, "step": 4360 }, { "epoch": 0.8577036310107949, "grad_norm": 36643.16984723375, "learning_rate": 3.0217438861095315e-08, "logits/chosen": -5.537148475646973, "logits/rejected": -5.467263698577881, "logps/chosen": -5268.40380859375, "logps/rejected": -9365.9228515625, "loss": -60.0382, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -5061.8369140625, "rewards/margins": 4082.012451171875, "rewards/rejected": -9143.849609375, "step": 4370 }, { "epoch": 0.8596663395485771, "grad_norm": 8017.9428043001235, "learning_rate": 2.940623300910572e-08, "logits/chosen": -5.250490665435791, "logits/rejected": -5.0932183265686035, "logps/chosen": -4774.69580078125, "logps/rejected": -7556.234375, "loss": -50.0535, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -4454.826171875, "rewards/margins": 2899.708984375, "rewards/rejected": -7354.53515625, "step": 4380 }, { "epoch": 0.8616290480863592, "grad_norm": 9406.06629751091, "learning_rate": 2.860538364742898e-08, "logits/chosen": -5.315346717834473, "logits/rejected": -5.702456474304199, "logps/chosen": -7435.3154296875, "logps/rejected": -16557.20703125, "loss": -66.4528, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -7065.48291015625, "rewards/margins": 9271.509765625, "rewards/rejected": -16336.990234375, "step": 4390 }, { "epoch": 0.8635917566241413, "grad_norm": 15741.510077024554, "learning_rate": 2.7814928374537334e-08, "logits/chosen": -5.091046333312988, "logits/rejected": -5.5814313888549805, "logps/chosen": -7725.3232421875, "logps/rejected": -13735.224609375, "loss": -39.1633, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7499.1650390625, "rewards/margins": 6033.06201171875, "rewards/rejected": -13532.2265625, "step": 4400 }, { "epoch": 0.8655544651619235, "grad_norm": 28628.42421275461, "learning_rate": 2.7034904300918982e-08, "logits/chosen": -5.774806022644043, "logits/rejected": -5.638582229614258, "logps/chosen": -6584.2998046875, "logps/rejected": -12071.7568359375, "loss": -53.3256, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -6362.6357421875, "rewards/margins": 5446.296875, "rewards/rejected": -11808.931640625, "step": 4410 }, { "epoch": 0.8675171736997056, "grad_norm": 18242.345375088782, "learning_rate": 2.62653480473356e-08, "logits/chosen": -5.339226722717285, "logits/rejected": -5.340549468994141, "logps/chosen": -10654.8203125, "logps/rejected": -11784.619140625, "loss": -44.0694, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -10390.837890625, "rewards/margins": 1146.275146484375, "rewards/rejected": -11537.11328125, "step": 4420 }, { "epoch": 0.8694798822374877, "grad_norm": 6049.674372136059, "learning_rate": 2.550629574310309e-08, "logits/chosen": -5.854968070983887, "logits/rejected": -5.283340930938721, "logps/chosen": -6774.3193359375, "logps/rejected": -13617.611328125, "loss": -53.9708, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -6534.8515625, "rewards/margins": 6799.39697265625, "rewards/rejected": -13334.25, "step": 4430 }, { "epoch": 0.8714425907752699, "grad_norm": 43103.39815471932, "learning_rate": 2.475778302439524e-08, "logits/chosen": -5.462701320648193, "logits/rejected": -5.49961519241333, "logps/chosen": -8610.2578125, "logps/rejected": -14456.6376953125, "loss": -56.2331, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -8293.6044921875, "rewards/margins": 5913.560546875, "rewards/rejected": -14207.166015625, "step": 4440 }, { "epoch": 0.873405299313052, "grad_norm": 41262.996032408206, "learning_rate": 2.4019845032570875e-08, "logits/chosen": -5.545894622802734, "logits/rejected": -5.655886650085449, "logps/chosen": -5399.1748046875, "logps/rejected": -18175.90234375, "loss": -75.4151, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -5127.03173828125, "rewards/margins": 12771.462890625, "rewards/rejected": -17898.494140625, "step": 4450 }, { "epoch": 0.8753680078508341, "grad_norm": 7421.8571311872265, "learning_rate": 2.3292516412524054e-08, "logits/chosen": -5.745242595672607, "logits/rejected": -5.500452041625977, "logps/chosen": -13109.3017578125, "logps/rejected": -9585.521484375, "loss": -38.0646, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": -12814.5478515625, "rewards/margins": -3448.942626953125, "rewards/rejected": -9365.6044921875, "step": 4460 }, { "epoch": 0.8773307163886163, "grad_norm": 24596.990764845574, "learning_rate": 2.2575831311057225e-08, "logits/chosen": -5.4594950675964355, "logits/rejected": -5.816103935241699, "logps/chosen": -6505.94677734375, "logps/rejected": -13033.5517578125, "loss": -41.202, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -6272.58837890625, "rewards/margins": 6527.29052734375, "rewards/rejected": -12799.87890625, "step": 4470 }, { "epoch": 0.8792934249263984, "grad_norm": 9481.48282319698, "learning_rate": 2.1869823375278483e-08, "logits/chosen": -5.614555358886719, "logits/rejected": -5.273251056671143, "logps/chosen": -7815.4326171875, "logps/rejected": -11197.814453125, "loss": -56.0671, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -7602.0771484375, "rewards/margins": 3393.821533203125, "rewards/rejected": -10995.8994140625, "step": 4480 }, { "epoch": 0.8812561334641805, "grad_norm": 39689.84731742797, "learning_rate": 2.1174525751021578e-08, "logits/chosen": -5.441746711730957, "logits/rejected": -5.706095218658447, "logps/chosen": -5403.9716796875, "logps/rejected": -15337.544921875, "loss": -43.8152, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -5131.9970703125, "rewards/margins": 9934.451171875, "rewards/rejected": -15066.447265625, "step": 4490 }, { "epoch": 0.8832188420019627, "grad_norm": 24346.29263661773, "learning_rate": 2.0489971081290193e-08, "logits/chosen": -5.223479747772217, "logits/rejected": -5.908509731292725, "logps/chosen": -9484.626953125, "logps/rejected": -13132.865234375, "loss": -60.7269, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -9186.548828125, "rewards/margins": 3705.84765625, "rewards/rejected": -12892.396484375, "step": 4500 }, { "epoch": 0.8851815505397449, "grad_norm": 27087.065993929875, "learning_rate": 1.9816191504724826e-08, "logits/chosen": -5.180727481842041, "logits/rejected": -5.385349273681641, "logps/chosen": -6385.72265625, "logps/rejected": -9435.9736328125, "loss": -45.3433, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -6159.47216796875, "rewards/margins": 3061.211669921875, "rewards/rejected": -9220.685546875, "step": 4510 }, { "epoch": 0.887144259077527, "grad_norm": 9145.569064722426, "learning_rate": 1.9153218654094498e-08, "logits/chosen": -5.8254499435424805, "logits/rejected": -5.505230903625488, "logps/chosen": -8510.087890625, "logps/rejected": -13944.3125, "loss": -37.7843, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -8230.986328125, "rewards/margins": 5471.091796875, "rewards/rejected": -13702.078125, "step": 4520 }, { "epoch": 0.8891069676153092, "grad_norm": 4225.720571809484, "learning_rate": 1.8501083654811206e-08, "logits/chosen": -5.04479455947876, "logits/rejected": -5.729475498199463, "logps/chosen": -7710.2265625, "logps/rejected": -15623.0986328125, "loss": -71.2165, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7413.94287109375, "rewards/margins": 7948.0185546875, "rewards/rejected": -15361.962890625, "step": 4530 }, { "epoch": 0.8910696761530913, "grad_norm": 39268.888500166264, "learning_rate": 1.7859817123469068e-08, "logits/chosen": -5.637434959411621, "logits/rejected": -5.875400066375732, "logps/chosen": -4491.77099609375, "logps/rejected": -10183.8134765625, "loss": -39.9892, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4290.7333984375, "rewards/margins": 5663.5048828125, "rewards/rejected": -9954.2392578125, "step": 4540 }, { "epoch": 0.8930323846908734, "grad_norm": 6824.91582770684, "learning_rate": 1.7229449166406477e-08, "logits/chosen": -5.691622734069824, "logits/rejected": -6.149487018585205, "logps/chosen": -10521.505859375, "logps/rejected": -15587.568359375, "loss": -61.1856, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -10191.568359375, "rewards/margins": 5133.3173828125, "rewards/rejected": -15324.8857421875, "step": 4550 }, { "epoch": 0.8949950932286556, "grad_norm": 14696.985183487304, "learning_rate": 1.66100093782931e-08, "logits/chosen": -5.462733268737793, "logits/rejected": -5.765957832336426, "logps/chosen": -3220.137939453125, "logps/rejected": -13234.412109375, "loss": -96.2544, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -2958.337158203125, "rewards/margins": 9984.947265625, "rewards/rejected": -12943.283203125, "step": 4560 }, { "epoch": 0.8969578017664377, "grad_norm": 46383.41337693205, "learning_rate": 1.600152684074005e-08, "logits/chosen": -5.3520188331604, "logits/rejected": -5.641770362854004, "logps/chosen": -14836.9404296875, "logps/rejected": -14723.1611328125, "loss": -41.8942, "rewards/accuracies": 0.36666667461395264, "rewards/chosen": -14513.6201171875, "rewards/margins": -107.5040054321289, "rewards/rejected": -14406.115234375, "step": 4570 }, { "epoch": 0.8989205103042198, "grad_norm": 14507.682430843983, "learning_rate": 1.540403012093483e-08, "logits/chosen": -5.395419120788574, "logits/rejected": -5.637910842895508, "logps/chosen": -6759.8974609375, "logps/rejected": -8852.4169921875, "loss": -46.6795, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -6451.7412109375, "rewards/margins": 2161.441162109375, "rewards/rejected": -8613.1826171875, "step": 4580 }, { "epoch": 0.900883218842002, "grad_norm": 24903.56816635334, "learning_rate": 1.4817547270300185e-08, "logits/chosen": -5.690587043762207, "logits/rejected": -5.284477710723877, "logps/chosen": -11572.6806640625, "logps/rejected": -15214.642578125, "loss": -26.0716, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -11304.857421875, "rewards/margins": 3559.34912109375, "rewards/rejected": -14864.2080078125, "step": 4590 }, { "epoch": 0.9028459273797841, "grad_norm": 4775.7918327393445, "learning_rate": 1.4242105823176837e-08, "logits/chosen": -5.056507110595703, "logits/rejected": -5.057549953460693, "logps/chosen": -4456.9970703125, "logps/rejected": -13321.458984375, "loss": -53.544, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -4137.091796875, "rewards/margins": 8929.5947265625, "rewards/rejected": -13066.685546875, "step": 4600 }, { "epoch": 0.9048086359175662, "grad_norm": 29688.982280883767, "learning_rate": 1.3677732795531083e-08, "logits/chosen": -5.7876739501953125, "logits/rejected": -5.276827335357666, "logps/chosen": -10445.51171875, "logps/rejected": -14212.072265625, "loss": -27.6961, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -10181.3603515625, "rewards/margins": 3716.388671875, "rewards/rejected": -13897.75, "step": 4610 }, { "epoch": 0.9067713444553483, "grad_norm": 18092.266256542196, "learning_rate": 1.3124454683686364e-08, "logits/chosen": -5.048381328582764, "logits/rejected": -5.342141151428223, "logps/chosen": -8399.037109375, "logps/rejected": -13159.658203125, "loss": -40.0481, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -8149.17724609375, "rewards/margins": 4746.85693359375, "rewards/rejected": -12896.03515625, "step": 4620 }, { "epoch": 0.9087340529931305, "grad_norm": 4384.306790168832, "learning_rate": 1.2582297463079288e-08, "logits/chosen": -5.83663272857666, "logits/rejected": -5.49135160446167, "logps/chosen": -9881.5234375, "logps/rejected": -10213.7275390625, "loss": -27.6701, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -9652.45703125, "rewards/margins": 406.330322265625, "rewards/rejected": -10058.7861328125, "step": 4630 }, { "epoch": 0.9106967615309126, "grad_norm": 4745.075194751322, "learning_rate": 1.2051286587040049e-08, "logits/chosen": -5.543376922607422, "logits/rejected": -5.850880146026611, "logps/chosen": -4459.736328125, "logps/rejected": -14243.8076171875, "loss": -68.886, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4179.37255859375, "rewards/margins": 9787.7392578125, "rewards/rejected": -13967.111328125, "step": 4640 }, { "epoch": 0.9126594700686947, "grad_norm": 11415.125543920907, "learning_rate": 1.1531446985597604e-08, "logits/chosen": -5.12033224105835, "logits/rejected": -5.369235992431641, "logps/chosen": -7207.2412109375, "logps/rejected": -12399.3671875, "loss": -46.5387, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -6832.0380859375, "rewards/margins": 5281.021484375, "rewards/rejected": -12113.0595703125, "step": 4650 }, { "epoch": 0.914622178606477, "grad_norm": 50383.94488134638, "learning_rate": 1.1022803064309194e-08, "logits/chosen": -5.147212028503418, "logits/rejected": -5.279428958892822, "logps/chosen": -9135.490234375, "logps/rejected": -12597.8125, "loss": -50.1049, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -8817.939453125, "rewards/margins": 3399.02978515625, "rewards/rejected": -12216.966796875, "step": 4660 }, { "epoch": 0.9165848871442591, "grad_norm": 50363.97889842184, "learning_rate": 1.0525378703114401e-08, "logits/chosen": -5.336554527282715, "logits/rejected": -5.239108085632324, "logps/chosen": -11784.8173828125, "logps/rejected": -8329.6689453125, "loss": -9.3569, "rewards/accuracies": 0.2666666805744171, "rewards/chosen": -11573.857421875, "rewards/margins": -3434.34228515625, "rewards/rejected": -8139.515625, "step": 4670 }, { "epoch": 0.9185475956820413, "grad_norm": 39047.25542819554, "learning_rate": 1.0039197255214238e-08, "logits/chosen": -5.403003215789795, "logits/rejected": -5.482144355773926, "logps/chosen": -6678.1201171875, "logps/rejected": -10975.4296875, "loss": -52.0815, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -6489.85009765625, "rewards/margins": 4254.13720703125, "rewards/rejected": -10743.9873046875, "step": 4680 }, { "epoch": 0.9205103042198234, "grad_norm": 44070.458092805406, "learning_rate": 9.564281545974661e-09, "logits/chosen": -5.617677688598633, "logits/rejected": -5.28604793548584, "logps/chosen": -5524.7177734375, "logps/rejected": -11097.7353515625, "loss": -52.3076, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5266.83740234375, "rewards/margins": 5581.02587890625, "rewards/rejected": -10847.86328125, "step": 4690 }, { "epoch": 0.9224730127576055, "grad_norm": 32081.533671915462, "learning_rate": 9.100653871854963e-09, "logits/chosen": -5.469379901885986, "logits/rejected": -5.81614351272583, "logps/chosen": -7464.33203125, "logps/rejected": -19654.55859375, "loss": -81.046, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -7168.33984375, "rewards/margins": 12197.1083984375, "rewards/rejected": -19365.44921875, "step": 4700 }, { "epoch": 0.9244357212953876, "grad_norm": 12340.861955538485, "learning_rate": 8.648335999360934e-09, "logits/chosen": -5.612882614135742, "logits/rejected": -5.583306789398193, "logps/chosen": -6359.9296875, "logps/rejected": -10216.705078125, "loss": -49.2214, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -6098.6728515625, "rewards/margins": 3908.253173828125, "rewards/rejected": -10006.92578125, "step": 4710 }, { "epoch": 0.9263984298331698, "grad_norm": 11719.983292456409, "learning_rate": 8.207349164023047e-09, "logits/chosen": -5.638997554779053, "logits/rejected": -5.38157320022583, "logps/chosen": -6565.74609375, "logps/rejected": -11651.9853515625, "loss": -49.7981, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -6301.10595703125, "rewards/margins": 5089.6650390625, "rewards/rejected": -11390.771484375, "step": 4720 }, { "epoch": 0.9283611383709519, "grad_norm": 42100.73117646423, "learning_rate": 7.777714069399532e-09, "logits/chosen": -5.442720890045166, "logits/rejected": -5.505597114562988, "logps/chosen": -11182.3046875, "logps/rejected": -10622.845703125, "loss": -49.4186, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -10909.029296875, "rewards/margins": -540.8515625, "rewards/rejected": -10368.173828125, "step": 4730 }, { "epoch": 0.930323846908734, "grad_norm": 30412.501899711497, "learning_rate": 7.359450886104263e-09, "logits/chosen": -5.728358268737793, "logits/rejected": -5.670831680297852, "logps/chosen": -6310.69775390625, "logps/rejected": -13372.705078125, "loss": -28.2274, "rewards/accuracies": 0.76666659116745, "rewards/chosen": -6012.96728515625, "rewards/margins": 7116.56884765625, "rewards/rejected": -13129.5361328125, "step": 4740 }, { "epoch": 0.9322865554465162, "grad_norm": 50616.841931103976, "learning_rate": 6.9525792508597634e-09, "logits/chosen": -5.438574314117432, "logits/rejected": -5.316531181335449, "logps/chosen": -9475.669921875, "logps/rejected": -13283.732421875, "loss": -70.1361, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -9186.7080078125, "rewards/margins": 3810.594970703125, "rewards/rejected": -12997.302734375, "step": 4750 }, { "epoch": 0.9342492639842983, "grad_norm": 56509.6988599394, "learning_rate": 6.557118265575451e-09, "logits/chosen": -5.123375415802002, "logits/rejected": -5.459023475646973, "logps/chosen": -5494.5908203125, "logps/rejected": -15566.390625, "loss": -57.7777, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -5189.42138671875, "rewards/margins": 10101.0009765625, "rewards/rejected": -15290.421875, "step": 4760 }, { "epoch": 0.9362119725220804, "grad_norm": 10388.31779205407, "learning_rate": 6.1730864964507636e-09, "logits/chosen": -5.652479648590088, "logits/rejected": -6.010986328125, "logps/chosen": -8632.37109375, "logps/rejected": -13256.7470703125, "loss": -63.4052, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8335.6015625, "rewards/margins": 4681.97705078125, "rewards/rejected": -13017.578125, "step": 4770 }, { "epoch": 0.9381746810598626, "grad_norm": 25259.61543930017, "learning_rate": 5.8005019731033615e-09, "logits/chosen": -5.1255035400390625, "logits/rejected": -5.424807548522949, "logps/chosen": -4975.4345703125, "logps/rejected": -9264.630859375, "loss": -46.7831, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -4695.54541015625, "rewards/margins": 4345.7470703125, "rewards/rejected": -9041.2919921875, "step": 4780 }, { "epoch": 0.9401373895976447, "grad_norm": 8104.142923477791, "learning_rate": 5.439382187722968e-09, "logits/chosen": -5.003173351287842, "logits/rejected": -5.18607234954834, "logps/chosen": -5931.4306640625, "logps/rejected": -11581.56640625, "loss": -55.1294, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -5548.064453125, "rewards/margins": 5747.18017578125, "rewards/rejected": -11295.2431640625, "step": 4790 }, { "epoch": 0.9421000981354269, "grad_norm": 12251.105521415786, "learning_rate": 5.089744094249837e-09, "logits/chosen": -5.336795806884766, "logits/rejected": -5.199450969696045, "logps/chosen": -13796.9013671875, "logps/rejected": -15510.0205078125, "loss": -37.3836, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -13426.984375, "rewards/margins": 1818.8597412109375, "rewards/rejected": -15245.84375, "step": 4800 }, { "epoch": 0.9440628066732091, "grad_norm": 15955.510539630171, "learning_rate": 4.751604107579077e-09, "logits/chosen": -5.523677825927734, "logits/rejected": -5.780304908752441, "logps/chosen": -7601.390625, "logps/rejected": -14946.6591796875, "loss": -59.4672, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -7317.7890625, "rewards/margins": 7382.2666015625, "rewards/rejected": -14700.056640625, "step": 4810 }, { "epoch": 0.9460255152109912, "grad_norm": 56598.077915730704, "learning_rate": 4.424978102789661e-09, "logits/chosen": -5.542874336242676, "logits/rejected": -5.733336448669434, "logps/chosen": -6992.5673828125, "logps/rejected": -12525.44140625, "loss": -46.5405, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -6586.0419921875, "rewards/margins": 5687.77734375, "rewards/rejected": -12273.8203125, "step": 4820 }, { "epoch": 0.9479882237487733, "grad_norm": 36415.09568570072, "learning_rate": 4.109881414399524e-09, "logits/chosen": -5.343423843383789, "logits/rejected": -5.48567008972168, "logps/chosen": -7101.3203125, "logps/rejected": -11046.9111328125, "loss": -50.3575, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -6822.6845703125, "rewards/margins": 3943.21435546875, "rewards/rejected": -10765.8994140625, "step": 4830 }, { "epoch": 0.9499509322865555, "grad_norm": 55931.23957464586, "learning_rate": 3.806328835645272e-09, "logits/chosen": -5.090592861175537, "logits/rejected": -5.265051364898682, "logps/chosen": -5237.98876953125, "logps/rejected": -10700.7080078125, "loss": -59.1953, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -4996.5986328125, "rewards/margins": 5474.8076171875, "rewards/rejected": -10471.40625, "step": 4840 }, { "epoch": 0.9519136408243376, "grad_norm": 34103.085788176366, "learning_rate": 3.5143346177878565e-09, "logits/chosen": -5.03296422958374, "logits/rejected": -5.475438594818115, "logps/chosen": -9065.857421875, "logps/rejected": -18007.5234375, "loss": -43.1971, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -8696.8427734375, "rewards/margins": 9040.8427734375, "rewards/rejected": -17737.6875, "step": 4850 }, { "epoch": 0.9538763493621197, "grad_norm": 20634.451363294414, "learning_rate": 3.233912469443545e-09, "logits/chosen": -5.024641036987305, "logits/rejected": -4.951289176940918, "logps/chosen": -8460.7998046875, "logps/rejected": -7922.67822265625, "loss": -16.9978, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -8133.03662109375, "rewards/margins": -429.95556640625, "rewards/rejected": -7703.0810546875, "step": 4860 }, { "epoch": 0.9558390578999019, "grad_norm": 28282.062936966267, "learning_rate": 2.9650755559401388e-09, "logits/chosen": -5.4345316886901855, "logits/rejected": -5.402246475219727, "logps/chosen": -9350.966796875, "logps/rejected": -14678.5625, "loss": -54.6854, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -9004.0517578125, "rewards/margins": 5360.6943359375, "rewards/rejected": -14364.7470703125, "step": 4870 }, { "epoch": 0.957801766437684, "grad_norm": 9406.707479131217, "learning_rate": 2.7078364986990175e-09, "logits/chosen": -5.931756973266602, "logits/rejected": -5.564810276031494, "logps/chosen": -13038.3798828125, "logps/rejected": -16408.6875, "loss": -30.5162, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -12642.640625, "rewards/margins": 3456.17431640625, "rewards/rejected": -16098.8154296875, "step": 4880 }, { "epoch": 0.9597644749754661, "grad_norm": 36857.299894620715, "learning_rate": 2.4622073746426165e-09, "logits/chosen": -5.324545860290527, "logits/rejected": -5.297689914703369, "logps/chosen": -4729.51171875, "logps/rejected": -12161.0888671875, "loss": -33.2457, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4455.60400390625, "rewards/margins": 7480.34912109375, "rewards/rejected": -11935.953125, "step": 4890 }, { "epoch": 0.9617271835132483, "grad_norm": 48730.25832168058, "learning_rate": 2.2281997156273213e-09, "logits/chosen": -5.6009392738342285, "logits/rejected": -5.509004592895508, "logps/chosen": -7457.6572265625, "logps/rejected": -12534.080078125, "loss": -29.9732, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -7145.31103515625, "rewards/margins": 5130.0791015625, "rewards/rejected": -12275.3896484375, "step": 4900 }, { "epoch": 0.9636898920510304, "grad_norm": 9985.545896526964, "learning_rate": 2.0058245079021265e-09, "logits/chosen": -5.291119575500488, "logits/rejected": -5.261479377746582, "logps/chosen": -6802.4296875, "logps/rejected": -8649.771484375, "loss": -38.2758, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -6542.68408203125, "rewards/margins": 1905.0712890625, "rewards/rejected": -8447.755859375, "step": 4910 }, { "epoch": 0.9656526005888125, "grad_norm": 38492.49943614459, "learning_rate": 1.7950921915928784e-09, "logits/chosen": -5.766026020050049, "logits/rejected": -5.571219444274902, "logps/chosen": -7481.9970703125, "logps/rejected": -11754.462890625, "loss": -54.5138, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7216.64599609375, "rewards/margins": 4326.9111328125, "rewards/rejected": -11543.5576171875, "step": 4920 }, { "epoch": 0.9676153091265947, "grad_norm": 10926.195189809292, "learning_rate": 1.596012660212087e-09, "logits/chosen": -5.467657089233398, "logits/rejected": -5.737392902374268, "logps/chosen": -9283.4228515625, "logps/rejected": -14037.265625, "loss": -48.1153, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -8964.728515625, "rewards/margins": 4853.01708984375, "rewards/rejected": -13817.7470703125, "step": 4930 }, { "epoch": 0.9695780176643768, "grad_norm": 16580.960786165768, "learning_rate": 1.408595260194434e-09, "logits/chosen": -5.644110679626465, "logits/rejected": -5.414385795593262, "logps/chosen": -9681.896484375, "logps/rejected": -8910.7099609375, "loss": -17.6438, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": -9366.7763671875, "rewards/margins": -657.1585693359375, "rewards/rejected": -8709.619140625, "step": 4940 }, { "epoch": 0.971540726202159, "grad_norm": 15154.641789535359, "learning_rate": 1.2328487904580131e-09, "logits/chosen": -5.1629438400268555, "logits/rejected": -5.696649074554443, "logps/chosen": -6506.72021484375, "logps/rejected": -11326.44921875, "loss": -35.4168, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -6292.8994140625, "rewards/margins": 4814.1826171875, "rewards/rejected": -11107.0830078125, "step": 4950 }, { "epoch": 0.9735034347399412, "grad_norm": 23521.77590995972, "learning_rate": 1.0687815019912173e-09, "logits/chosen": -6.030200004577637, "logits/rejected": -5.276554107666016, "logps/chosen": -7984.73828125, "logps/rejected": -12041.146484375, "loss": -53.3227, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -7704.26416015625, "rewards/margins": 4005.686279296875, "rewards/rejected": -11709.951171875, "step": 4960 }, { "epoch": 0.9754661432777233, "grad_norm": 30184.052858608244, "learning_rate": 9.164010974653802e-10, "logits/chosen": -5.343989372253418, "logits/rejected": -5.537137985229492, "logps/chosen": -6235.22265625, "logps/rejected": -15414.9404296875, "loss": -49.7714, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -5967.2841796875, "rewards/margins": 9175.1005859375, "rewards/rejected": -15142.3857421875, "step": 4970 }, { "epoch": 0.9774288518155054, "grad_norm": 75633.11479388778, "learning_rate": 7.757147308731504e-10, "logits/chosen": -5.246553897857666, "logits/rejected": -5.361756324768066, "logps/chosen": -8798.314453125, "logps/rejected": -11697.5380859375, "loss": -37.8348, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8490.5078125, "rewards/margins": 2937.796142578125, "rewards/rejected": -11428.302734375, "step": 4980 }, { "epoch": 0.9793915603532876, "grad_norm": 9492.132423005378, "learning_rate": 6.467290071925646e-10, "logits/chosen": -5.757071018218994, "logits/rejected": -5.482789993286133, "logps/chosen": -7432.92333984375, "logps/rejected": -10776.72265625, "loss": -44.6838, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -7218.9287109375, "rewards/margins": 3335.837158203125, "rewards/rejected": -10554.765625, "step": 4990 }, { "epoch": 0.9813542688910697, "grad_norm": 33228.30306147814, "learning_rate": 5.29449982077046e-10, "logits/chosen": -5.681429862976074, "logits/rejected": -5.440320014953613, "logps/chosen": -9965.1376953125, "logps/rejected": -11100.287109375, "loss": -32.2383, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -9680.611328125, "rewards/margins": 1205.5638427734375, "rewards/rejected": -10886.173828125, "step": 5000 }, { "epoch": 0.9833169774288518, "grad_norm": 23405.38870952273, "learning_rate": 4.2388316157104806e-10, "logits/chosen": -5.696514129638672, "logits/rejected": -5.184239387512207, "logps/chosen": -6139.44287109375, "logps/rejected": -9225.087890625, "loss": -51.2489, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5871.55078125, "rewards/margins": 3132.18408203125, "rewards/rejected": -9003.734375, "step": 5010 }, { "epoch": 0.985279685966634, "grad_norm": 18062.597671334774, "learning_rate": 3.300335018515676e-10, "logits/chosen": -5.490758419036865, "logits/rejected": -5.224146842956543, "logps/chosen": -5866.4052734375, "logps/rejected": -9715.783203125, "loss": -47.3662, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -5652.626953125, "rewards/margins": 3898.374267578125, "rewards/rejected": -9551.001953125, "step": 5020 }, { "epoch": 0.9872423945044161, "grad_norm": 37027.413775786474, "learning_rate": 2.4790540899546907e-10, "logits/chosen": -5.474011421203613, "logits/rejected": -5.447179317474365, "logps/chosen": -4271.74560546875, "logps/rejected": -11613.865234375, "loss": -70.6022, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4049.6953125, "rewards/margins": 7288.49462890625, "rewards/rejected": -11338.189453125, "step": 5030 }, { "epoch": 0.9892051030421982, "grad_norm": 16898.663549016208, "learning_rate": 1.7750273877262244e-10, "logits/chosen": -4.986571788787842, "logits/rejected": -5.457891464233398, "logps/chosen": -8416.396484375, "logps/rejected": -13331.349609375, "loss": -65.6238, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -8128.3017578125, "rewards/margins": 4941.771484375, "rewards/rejected": -13070.0732421875, "step": 5040 }, { "epoch": 0.9911678115799804, "grad_norm": 36736.860384580476, "learning_rate": 1.1882879646485379e-10, "logits/chosen": -5.727559566497803, "logits/rejected": -5.958432197570801, "logps/chosen": -5797.2978515625, "logps/rejected": -15178.669921875, "loss": -47.0752, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -5572.09521484375, "rewards/margins": 9390.853515625, "rewards/rejected": -14962.947265625, "step": 5050 }, { "epoch": 0.9931305201177625, "grad_norm": 25966.84483639237, "learning_rate": 7.188633671079136e-11, "logits/chosen": -5.941857814788818, "logits/rejected": -5.766529083251953, "logps/chosen": -8062.7119140625, "logps/rejected": -10428.0439453125, "loss": -35.896, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -7774.5048828125, "rewards/margins": 2443.75048828125, "rewards/rejected": -10218.25390625, "step": 5060 }, { "epoch": 0.9950932286555446, "grad_norm": 43698.80343593512, "learning_rate": 3.6677563376580344e-11, "logits/chosen": -5.737082481384277, "logits/rejected": -5.714974403381348, "logps/chosen": -10501.037109375, "logps/rejected": -16650.15234375, "loss": -56.6408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -10236.2158203125, "rewards/margins": 6103.4013671875, "rewards/rejected": -16339.615234375, "step": 5070 }, { "epoch": 0.9970559371933267, "grad_norm": 13565.993098132318, "learning_rate": 1.3204129452354385e-11, "logits/chosen": -5.741917133331299, "logits/rejected": -5.2679948806762695, "logps/chosen": -9109.6845703125, "logps/rejected": -12618.1591796875, "loss": -34.0468, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8838.6025390625, "rewards/margins": 3494.30517578125, "rewards/rejected": -12332.908203125, "step": 5080 }, { "epoch": 0.9990186457311089, "grad_norm": 26074.68459495198, "learning_rate": 1.467136974631078e-12, "logits/chosen": -5.634035587310791, "logits/rejected": -5.727025032043457, "logps/chosen": -9727.283203125, "logps/rejected": -10950.7529296875, "loss": -53.1696, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -9455.361328125, "rewards/margins": 1293.5570068359375, "rewards/rejected": -10748.91796875, "step": 5090 }, { "epoch": 1.0, "step": 5095, "total_flos": 0.0, "train_loss": -34.73410764547637, "train_runtime": 17646.3029, "train_samples_per_second": 3.464, "train_steps_per_second": 0.289 } ], "logging_steps": 10, "max_steps": 5095, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }