diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7692 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5095, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00019627085377821394, + "grad_norm": 7695.078157716598, + "learning_rate": 9.803921568627451e-10, + "logits/chosen": -2.9195547103881836, + "logits/rejected": -2.4565553665161133, + "logps/chosen": -421.782470703125, + "logps/rejected": -89.33955383300781, + "loss": 499.7888, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.001962708537782139, + "grad_norm": 8523.861999496248, + "learning_rate": 9.803921568627451e-09, + "logits/chosen": -2.5579118728637695, + "logits/rejected": -2.5539872646331787, + "logps/chosen": -328.5361633300781, + "logps/rejected": -224.728515625, + "loss": 500.0604, + "rewards/accuracies": 0.37037035822868347, + "rewards/chosen": -0.02532227709889412, + "rewards/margins": -0.12882067263126373, + "rewards/rejected": 0.10349839180707932, + "step": 10 + }, + { + "epoch": 0.003925417075564278, + "grad_norm": 8141.883083281547, + "learning_rate": 1.9607843137254902e-08, + "logits/chosen": -2.748523473739624, + "logits/rejected": -2.6494884490966797, + "logps/chosen": -241.36862182617188, + "logps/rejected": -228.7290802001953, + "loss": 466.6132, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.20755431056022644, + "rewards/margins": 0.09448665380477905, + "rewards/rejected": 0.11306764930486679, + "step": 20 + }, + { + "epoch": 0.005888125613346418, + "grad_norm": 9089.169358641317, + "learning_rate": 2.941176470588235e-08, + "logits/chosen": -2.805922031402588, + "logits/rejected": -2.7502973079681396, + "logps/chosen": -271.3504333496094, + "logps/rejected": -276.63763427734375, + "loss": 573.6512, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.38350290060043335, + "rewards/margins": -0.16553720831871033, + "rewards/rejected": 0.5490401387214661, + "step": 30 + }, + { + "epoch": 0.007850834151128557, + "grad_norm": 8417.60796163703, + "learning_rate": 3.9215686274509804e-08, + "logits/chosen": -2.5296969413757324, + "logits/rejected": -2.614142894744873, + "logps/chosen": -234.57723999023438, + "logps/rejected": -197.72872924804688, + "loss": 566.4437, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 1.5821893215179443, + "rewards/margins": 0.21359722316265106, + "rewards/rejected": 1.3685922622680664, + "step": 40 + }, + { + "epoch": 0.009813542688910697, + "grad_norm": 6327.552834890516, + "learning_rate": 4.901960784313725e-08, + "logits/chosen": -2.7662551403045654, + "logits/rejected": -2.7321105003356934, + "logps/chosen": -261.9884948730469, + "logps/rejected": -280.5721435546875, + "loss": 567.1037, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 3.4329071044921875, + "rewards/margins": 0.40464481711387634, + "rewards/rejected": 3.0282623767852783, + "step": 50 + }, + { + "epoch": 0.011776251226692836, + "grad_norm": 5972.253584813363, + "learning_rate": 5.88235294117647e-08, + "logits/chosen": -2.759847640991211, + "logits/rejected": -2.6784212589263916, + "logps/chosen": -249.282470703125, + "logps/rejected": -230.25588989257812, + "loss": 525.8602, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 
5.610996246337891, + "rewards/margins": -0.2631208300590515, + "rewards/rejected": 5.874117374420166, + "step": 60 + }, + { + "epoch": 0.013738959764474975, + "grad_norm": 5990.262385424046, + "learning_rate": 6.862745098039216e-08, + "logits/chosen": -2.846557140350342, + "logits/rejected": -2.780174493789673, + "logps/chosen": -294.8406677246094, + "logps/rejected": -229.6591033935547, + "loss": 501.9, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 7.265566825866699, + "rewards/margins": -0.3561328053474426, + "rewards/rejected": 7.621700286865234, + "step": 70 + }, + { + "epoch": 0.015701668302257114, + "grad_norm": 5796.982235405843, + "learning_rate": 7.843137254901961e-08, + "logits/chosen": -2.7839438915252686, + "logits/rejected": -2.595407009124756, + "logps/chosen": -315.75201416015625, + "logps/rejected": -205.21286010742188, + "loss": 535.338, + "rewards/accuracies": 0.5, + "rewards/chosen": 8.52326488494873, + "rewards/margins": 0.2876408100128174, + "rewards/rejected": 8.235624313354492, + "step": 80 + }, + { + "epoch": 0.017664376840039256, + "grad_norm": 5927.090334614455, + "learning_rate": 8.823529411764706e-08, + "logits/chosen": -2.844291925430298, + "logits/rejected": -2.818943738937378, + "logps/chosen": -259.13348388671875, + "logps/rejected": -261.32122802734375, + "loss": 555.3117, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 11.018460273742676, + "rewards/margins": -0.5622838139533997, + "rewards/rejected": 11.580743789672852, + "step": 90 + }, + { + "epoch": 0.019627085377821395, + "grad_norm": 4923.734511273913, + "learning_rate": 9.80392156862745e-08, + "logits/chosen": -2.8124048709869385, + "logits/rejected": -2.730870246887207, + "logps/chosen": -284.17413330078125, + "logps/rejected": -248.091064453125, + "loss": 487.075, + "rewards/accuracies": 0.29999998211860657, + "rewards/chosen": 11.848287582397461, + "rewards/margins": 0.02657313272356987, + "rewards/rejected": 11.821714401245117, + "step": 100 + }, + { + "epoch": 0.021589793915603533, + "grad_norm": 5710.345549001755, + "learning_rate": 1.0784313725490195e-07, + "logits/chosen": -2.841452121734619, + "logits/rejected": -2.760373592376709, + "logps/chosen": -322.677978515625, + "logps/rejected": -283.0740661621094, + "loss": 496.9658, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 10.059597969055176, + "rewards/margins": 0.1907721310853958, + "rewards/rejected": 9.868825912475586, + "step": 110 + }, + { + "epoch": 0.023552502453385672, + "grad_norm": 4267.1269607205095, + "learning_rate": 1.176470588235294e-07, + "logits/chosen": -2.6766438484191895, + "logits/rejected": -2.60054349899292, + "logps/chosen": -189.0832977294922, + "logps/rejected": -173.5619659423828, + "loss": 500.976, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 13.830352783203125, + "rewards/margins": -1.0408800840377808, + "rewards/rejected": 14.871232986450195, + "step": 120 + }, + { + "epoch": 0.02551521099116781, + "grad_norm": 5055.7057004203425, + "learning_rate": 1.2745098039215685e-07, + "logits/chosen": -2.6325724124908447, + "logits/rejected": -2.6757912635803223, + "logps/chosen": -345.6778259277344, + "logps/rejected": -288.1807556152344, + "loss": 516.968, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 15.467287063598633, + "rewards/margins": 2.80692720413208, + "rewards/rejected": 12.660360336303711, + "step": 130 + }, + { + "epoch": 0.02747791952894995, + "grad_norm": 6694.84371329099, + "learning_rate": 1.3725490196078432e-07, + 
"logits/chosen": -2.712160587310791, + "logits/rejected": -2.756772518157959, + "logps/chosen": -179.55929565429688, + "logps/rejected": -178.046142578125, + "loss": 494.6539, + "rewards/accuracies": 0.36666667461395264, + "rewards/chosen": 13.633160591125488, + "rewards/margins": -1.888087272644043, + "rewards/rejected": 15.521249771118164, + "step": 140 + }, + { + "epoch": 0.029440628066732092, + "grad_norm": 5094.761864381693, + "learning_rate": 1.4705882352941175e-07, + "logits/chosen": -2.8623595237731934, + "logits/rejected": -2.7546021938323975, + "logps/chosen": -219.88882446289062, + "logps/rejected": -209.93106079101562, + "loss": 487.1993, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 14.253957748413086, + "rewards/margins": -1.1866604089736938, + "rewards/rejected": 15.440618515014648, + "step": 150 + }, + { + "epoch": 0.03140333660451423, + "grad_norm": 5722.953728892541, + "learning_rate": 1.5686274509803921e-07, + "logits/chosen": -2.8009588718414307, + "logits/rejected": -2.7127015590667725, + "logps/chosen": -263.4642028808594, + "logps/rejected": -204.67913818359375, + "loss": 488.6207, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 15.075262069702148, + "rewards/margins": 1.5416971445083618, + "rewards/rejected": 13.533564567565918, + "step": 160 + }, + { + "epoch": 0.033366045142296366, + "grad_norm": 5406.103665715419, + "learning_rate": 1.6666666666666665e-07, + "logits/chosen": -2.688260555267334, + "logits/rejected": -2.66151762008667, + "logps/chosen": -230.9227752685547, + "logps/rejected": -188.932861328125, + "loss": 486.579, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 17.17111587524414, + "rewards/margins": -0.39257413148880005, + "rewards/rejected": 17.563688278198242, + "step": 170 + }, + { + "epoch": 0.03532875368007851, + "grad_norm": 3607.492970015055, + "learning_rate": 1.764705882352941e-07, + "logits/chosen": -2.7789673805236816, + "logits/rejected": -2.679816722869873, + "logps/chosen": -242.88671875, + "logps/rejected": -227.7840118408203, + "loss": 481.6462, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 15.954030990600586, + "rewards/margins": -3.311750888824463, + "rewards/rejected": 19.265785217285156, + "step": 180 + }, + { + "epoch": 0.03729146221786065, + "grad_norm": 4808.623970063617, + "learning_rate": 1.8627450980392158e-07, + "logits/chosen": -2.6783928871154785, + "logits/rejected": -2.6424717903137207, + "logps/chosen": -300.4185791015625, + "logps/rejected": -239.74569702148438, + "loss": 496.5815, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 17.760433197021484, + "rewards/margins": 0.7568072080612183, + "rewards/rejected": 17.003625869750977, + "step": 190 + }, + { + "epoch": 0.03925417075564279, + "grad_norm": 8590.339480711116, + "learning_rate": 1.96078431372549e-07, + "logits/chosen": -2.8148932456970215, + "logits/rejected": -2.617837429046631, + "logps/chosen": -262.127197265625, + "logps/rejected": -177.19729614257812, + "loss": 528.763, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": 24.703397750854492, + "rewards/margins": 5.610960960388184, + "rewards/rejected": 19.092435836791992, + "step": 200 + }, + { + "epoch": 0.04121687929342493, + "grad_norm": 4772.234179976811, + "learning_rate": 2.0588235294117645e-07, + "logits/chosen": -2.571575164794922, + "logits/rejected": -2.488711357116699, + "logps/chosen": -224.4163055419922, + "logps/rejected": -219.7928466796875, + "loss": 491.4365, + "rewards/accuracies": 
0.6333333253860474, + "rewards/chosen": 17.970706939697266, + "rewards/margins": 2.292705535888672, + "rewards/rejected": 15.678001403808594, + "step": 210 + }, + { + "epoch": 0.04317958783120707, + "grad_norm": 4457.756320569655, + "learning_rate": 2.156862745098039e-07, + "logits/chosen": -2.77001953125, + "logits/rejected": -2.7243027687072754, + "logps/chosen": -270.52069091796875, + "logps/rejected": -267.315185546875, + "loss": 548.6979, + "rewards/accuracies": 0.5, + "rewards/chosen": 17.678638458251953, + "rewards/margins": -0.4655752182006836, + "rewards/rejected": 18.14421272277832, + "step": 220 + }, + { + "epoch": 0.045142296368989206, + "grad_norm": 4436.791958749799, + "learning_rate": 2.2549019607843137e-07, + "logits/chosen": -2.8024957180023193, + "logits/rejected": -2.660709857940674, + "logps/chosen": -246.68179321289062, + "logps/rejected": -183.47801208496094, + "loss": 458.2289, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 20.225656509399414, + "rewards/margins": 1.1780885457992554, + "rewards/rejected": 19.04756736755371, + "step": 230 + }, + { + "epoch": 0.047105004906771344, + "grad_norm": 4851.214394709931, + "learning_rate": 2.352941176470588e-07, + "logits/chosen": -2.688019037246704, + "logits/rejected": -2.6779627799987793, + "logps/chosen": -230.35806274414062, + "logps/rejected": -223.46469116210938, + "loss": 490.6466, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": 19.589611053466797, + "rewards/margins": -4.361724853515625, + "rewards/rejected": 23.95133399963379, + "step": 240 + }, + { + "epoch": 0.04906771344455348, + "grad_norm": 4924.226168440959, + "learning_rate": 2.4509803921568627e-07, + "logits/chosen": -2.6655263900756836, + "logits/rejected": -2.59631609916687, + "logps/chosen": -213.60543823242188, + "logps/rejected": -182.390625, + "loss": 458.9436, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 25.82254409790039, + "rewards/margins": 5.193560600280762, + "rewards/rejected": 20.628982543945312, + "step": 250 + }, + { + "epoch": 0.05103042198233562, + "grad_norm": 5042.917008262602, + "learning_rate": 2.549019607843137e-07, + "logits/chosen": -2.750251293182373, + "logits/rejected": -2.7108583450317383, + "logps/chosen": -296.44635009765625, + "logps/rejected": -226.4813690185547, + "loss": 488.1532, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 21.304428100585938, + "rewards/margins": -1.4539859294891357, + "rewards/rejected": 22.7584171295166, + "step": 260 + }, + { + "epoch": 0.05299313052011776, + "grad_norm": 4890.209437898859, + "learning_rate": 2.6470588235294114e-07, + "logits/chosen": -2.6184678077697754, + "logits/rejected": -2.6272358894348145, + "logps/chosen": -205.027587890625, + "logps/rejected": -197.83663940429688, + "loss": 401.3296, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 22.567365646362305, + "rewards/margins": -2.5471272468566895, + "rewards/rejected": 25.114490509033203, + "step": 270 + }, + { + "epoch": 0.0549558390578999, + "grad_norm": 4974.045072015982, + "learning_rate": 2.7450980392156863e-07, + "logits/chosen": -2.6787219047546387, + "logits/rejected": -2.6040050983428955, + "logps/chosen": -225.64871215820312, + "logps/rejected": -180.16539001464844, + "loss": 471.3332, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": 19.779897689819336, + "rewards/margins": -1.9217135906219482, + "rewards/rejected": 21.70161247253418, + "step": 280 + }, + { + "epoch": 0.05691854759568204, + "grad_norm": 4075.0039747753326, 
+ "learning_rate": 2.8431372549019607e-07, + "logits/chosen": -2.771533966064453, + "logits/rejected": -2.6332433223724365, + "logps/chosen": -279.80279541015625, + "logps/rejected": -208.13052368164062, + "loss": 477.8637, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 24.917438507080078, + "rewards/margins": 1.8436920642852783, + "rewards/rejected": 23.073745727539062, + "step": 290 + }, + { + "epoch": 0.058881256133464184, + "grad_norm": 4301.819357666736, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -2.626574754714966, + "logits/rejected": -2.628239154815674, + "logps/chosen": -259.4626770019531, + "logps/rejected": -275.0853576660156, + "loss": 458.7708, + "rewards/accuracies": 0.5, + "rewards/chosen": 23.829364776611328, + "rewards/margins": -1.2697381973266602, + "rewards/rejected": 25.099105834960938, + "step": 300 + }, + { + "epoch": 0.06084396467124632, + "grad_norm": 4704.80103510706, + "learning_rate": 3.0392156862745094e-07, + "logits/chosen": -2.675342082977295, + "logits/rejected": -2.5852386951446533, + "logps/chosen": -270.854248046875, + "logps/rejected": -212.65805053710938, + "loss": 508.6296, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 31.20000648498535, + "rewards/margins": 7.389936923980713, + "rewards/rejected": 23.81006622314453, + "step": 310 + }, + { + "epoch": 0.06280667320902845, + "grad_norm": 4797.39266587772, + "learning_rate": 3.1372549019607843e-07, + "logits/chosen": -2.653855085372925, + "logits/rejected": -2.6646554470062256, + "logps/chosen": -182.79727172851562, + "logps/rejected": -204.63052368164062, + "loss": 443.3549, + "rewards/accuracies": 0.533333420753479, + "rewards/chosen": 20.889680862426758, + "rewards/margins": 0.9416363835334778, + "rewards/rejected": 19.948043823242188, + "step": 320 + }, + { + "epoch": 0.0647693817468106, + "grad_norm": 3846.90384024348, + "learning_rate": 3.2352941176470586e-07, + "logits/chosen": -2.6689724922180176, + "logits/rejected": -2.618972063064575, + "logps/chosen": -251.56640625, + "logps/rejected": -183.63084411621094, + "loss": 397.1196, + "rewards/accuracies": 0.533333420753479, + "rewards/chosen": 24.467573165893555, + "rewards/margins": -5.52662467956543, + "rewards/rejected": 29.994197845458984, + "step": 330 + }, + { + "epoch": 0.06673209028459273, + "grad_norm": 5086.669533999424, + "learning_rate": 3.333333333333333e-07, + "logits/chosen": -2.7471423149108887, + "logits/rejected": -2.619368076324463, + "logps/chosen": -320.1322326660156, + "logps/rejected": -214.53182983398438, + "loss": 511.8433, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 30.095443725585938, + "rewards/margins": 3.8112258911132812, + "rewards/rejected": 26.284221649169922, + "step": 340 + }, + { + "epoch": 0.06869479882237488, + "grad_norm": 4135.450239206523, + "learning_rate": 3.431372549019608e-07, + "logits/chosen": -2.6878678798675537, + "logits/rejected": -2.6312239170074463, + "logps/chosen": -170.13888549804688, + "logps/rejected": -156.8441619873047, + "loss": 457.6831, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 20.482637405395508, + "rewards/margins": 0.018645573407411575, + "rewards/rejected": 20.463991165161133, + "step": 350 + }, + { + "epoch": 0.07065750736015702, + "grad_norm": 4955.612877440178, + "learning_rate": 3.529411764705882e-07, + "logits/chosen": -2.7627711296081543, + "logits/rejected": -2.596731662750244, + "logps/chosen": -323.8910217285156, + "logps/rejected": -228.11819458007812, + "loss": 513.3576, + 
"rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 26.47014808654785, + "rewards/margins": -1.49628746509552, + "rewards/rejected": 27.966434478759766, + "step": 360 + }, + { + "epoch": 0.07262021589793916, + "grad_norm": 5191.260127136411, + "learning_rate": 3.6274509803921566e-07, + "logits/chosen": -2.642749547958374, + "logits/rejected": -2.540039539337158, + "logps/chosen": -246.0485076904297, + "logps/rejected": -229.6261749267578, + "loss": 503.3668, + "rewards/accuracies": 0.3999999761581421, + "rewards/chosen": 27.6164608001709, + "rewards/margins": -5.55600643157959, + "rewards/rejected": 33.172462463378906, + "step": 370 + }, + { + "epoch": 0.0745829244357213, + "grad_norm": 4248.3126174245535, + "learning_rate": 3.7254901960784315e-07, + "logits/chosen": -2.4983344078063965, + "logits/rejected": -2.6608455181121826, + "logps/chosen": -216.2776641845703, + "logps/rejected": -261.11383056640625, + "loss": 463.8114, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 23.812610626220703, + "rewards/margins": -7.91140079498291, + "rewards/rejected": 31.724010467529297, + "step": 380 + }, + { + "epoch": 0.07654563297350343, + "grad_norm": 4560.638704744429, + "learning_rate": 3.8235294117647053e-07, + "logits/chosen": -2.541658401489258, + "logits/rejected": -2.384592056274414, + "logps/chosen": -243.137451171875, + "logps/rejected": -242.7369842529297, + "loss": 505.7781, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 23.56881332397461, + "rewards/margins": -12.386117935180664, + "rewards/rejected": 35.95492935180664, + "step": 390 + }, + { + "epoch": 0.07850834151128558, + "grad_norm": 5066.806916895327, + "learning_rate": 3.92156862745098e-07, + "logits/chosen": -2.685091495513916, + "logits/rejected": -2.5697619915008545, + "logps/chosen": -234.70156860351562, + "logps/rejected": -246.5504913330078, + "loss": 503.3179, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 29.678075790405273, + "rewards/margins": 0.47646698355674744, + "rewards/rejected": 29.201608657836914, + "step": 400 + }, + { + "epoch": 0.08047105004906771, + "grad_norm": 5227.403614179651, + "learning_rate": 4.019607843137255e-07, + "logits/chosen": -2.6153388023376465, + "logits/rejected": -2.6428751945495605, + "logps/chosen": -271.0966491699219, + "logps/rejected": -225.4984588623047, + "loss": 464.8479, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 30.264745712280273, + "rewards/margins": 2.1848247051239014, + "rewards/rejected": 28.079919815063477, + "step": 410 + }, + { + "epoch": 0.08243375858684986, + "grad_norm": 4868.874765478459, + "learning_rate": 4.117647058823529e-07, + "logits/chosen": -2.6614885330200195, + "logits/rejected": -2.6459619998931885, + "logps/chosen": -231.1891632080078, + "logps/rejected": -233.40103149414062, + "loss": 489.6508, + "rewards/accuracies": 0.3999999761581421, + "rewards/chosen": 29.440465927124023, + "rewards/margins": -13.364405632019043, + "rewards/rejected": 42.80486297607422, + "step": 420 + }, + { + "epoch": 0.08439646712463199, + "grad_norm": 6272.321633430104, + "learning_rate": 4.215686274509804e-07, + "logits/chosen": -2.8335399627685547, + "logits/rejected": -2.644537925720215, + "logps/chosen": -331.88970947265625, + "logps/rejected": -218.96865844726562, + "loss": 438.3674, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 32.342071533203125, + "rewards/margins": 11.023529052734375, + "rewards/rejected": 21.31854248046875, + "step": 430 + }, + { + "epoch": 
0.08635917566241413, + "grad_norm": 3468.7372394369486, + "learning_rate": 4.313725490196078e-07, + "logits/chosen": -2.6140809059143066, + "logits/rejected": -2.529378890991211, + "logps/chosen": -263.18231201171875, + "logps/rejected": -210.88919067382812, + "loss": 469.5156, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 36.2723274230957, + "rewards/margins": 4.149384498596191, + "rewards/rejected": 32.12294006347656, + "step": 440 + }, + { + "epoch": 0.08832188420019627, + "grad_norm": 4602.551772130593, + "learning_rate": 4.4117647058823526e-07, + "logits/chosen": -2.7470650672912598, + "logits/rejected": -2.8349151611328125, + "logps/chosen": -255.5126190185547, + "logps/rejected": -249.41067504882812, + "loss": 470.7945, + "rewards/accuracies": 0.3333333134651184, + "rewards/chosen": 25.42453384399414, + "rewards/margins": -14.558160781860352, + "rewards/rejected": 39.982696533203125, + "step": 450 + }, + { + "epoch": 0.09028459273797841, + "grad_norm": 4894.576240603985, + "learning_rate": 4.5098039215686274e-07, + "logits/chosen": -2.554935932159424, + "logits/rejected": -2.4666876792907715, + "logps/chosen": -220.52279663085938, + "logps/rejected": -231.3734130859375, + "loss": 472.8678, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 28.813745498657227, + "rewards/margins": 2.0909621715545654, + "rewards/rejected": 26.7227840423584, + "step": 460 + }, + { + "epoch": 0.09224730127576054, + "grad_norm": 4738.262801513381, + "learning_rate": 4.6078431372549013e-07, + "logits/chosen": -2.427926540374756, + "logits/rejected": -2.3819785118103027, + "logps/chosen": -237.2082061767578, + "logps/rejected": -274.40142822265625, + "loss": 489.6224, + "rewards/accuracies": 0.36666664481163025, + "rewards/chosen": 26.397693634033203, + "rewards/margins": -35.425819396972656, + "rewards/rejected": 61.823509216308594, + "step": 470 + }, + { + "epoch": 0.09421000981354269, + "grad_norm": 4975.613495131456, + "learning_rate": 4.705882352941176e-07, + "logits/chosen": -2.7603957653045654, + "logits/rejected": -2.664858341217041, + "logps/chosen": -287.56280517578125, + "logps/rejected": -292.58740234375, + "loss": 515.9132, + "rewards/accuracies": 0.5, + "rewards/chosen": 32.57997512817383, + "rewards/margins": 2.327850580215454, + "rewards/rejected": 30.252126693725586, + "step": 480 + }, + { + "epoch": 0.09617271835132483, + "grad_norm": 2980.3710795452093, + "learning_rate": 4.803921568627451e-07, + "logits/chosen": -2.433875560760498, + "logits/rejected": -2.4270520210266113, + "logps/chosen": -269.6871337890625, + "logps/rejected": -286.18511962890625, + "loss": 490.2408, + "rewards/accuracies": 0.3999999761581421, + "rewards/chosen": 24.98427963256836, + "rewards/margins": -10.737724304199219, + "rewards/rejected": 35.72200393676758, + "step": 490 + }, + { + "epoch": 0.09813542688910697, + "grad_norm": 4577.218197831634, + "learning_rate": 4.901960784313725e-07, + "logits/chosen": -2.6620137691497803, + "logits/rejected": -2.624298334121704, + "logps/chosen": -268.84979248046875, + "logps/rejected": -228.1798553466797, + "loss": 428.5641, + "rewards/accuracies": 0.5, + "rewards/chosen": 30.748973846435547, + "rewards/margins": 8.2810697555542, + "rewards/rejected": 22.46790313720703, + "step": 500 + }, + { + "epoch": 0.10009813542688911, + "grad_norm": 2974.4950215344847, + "learning_rate": 5e-07, + "logits/chosen": -2.559434175491333, + "logits/rejected": -2.4876298904418945, + "logps/chosen": -249.40090942382812, + "logps/rejected": -241.1505126953125, + 
"loss": 448.1437, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 38.660858154296875, + "rewards/margins": 7.0601806640625, + "rewards/rejected": 31.60067367553711, + "step": 510 + }, + { + "epoch": 0.10206084396467124, + "grad_norm": 4350.403609600577, + "learning_rate": 4.999941314693213e-07, + "logits/chosen": -2.574789524078369, + "logits/rejected": -2.550934314727783, + "logps/chosen": -218.6322784423828, + "logps/rejected": -173.77525329589844, + "loss": 460.1616, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 31.011077880859375, + "rewards/margins": -1.9973556995391846, + "rewards/rejected": 33.00843811035156, + "step": 520 + }, + { + "epoch": 0.10402355250245339, + "grad_norm": 5025.346211421526, + "learning_rate": 4.999765261528027e-07, + "logits/chosen": -2.609151601791382, + "logits/rejected": -2.6930503845214844, + "logps/chosen": -262.9989013671875, + "logps/rejected": -280.50274658203125, + "loss": 485.6344, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 26.35666847229004, + "rewards/margins": -9.358552932739258, + "rewards/rejected": 35.71521759033203, + "step": 530 + }, + { + "epoch": 0.10598626104023552, + "grad_norm": 4795.300896477626, + "learning_rate": 4.999471848769828e-07, + "logits/chosen": -2.504603385925293, + "logits/rejected": -2.509408950805664, + "logps/chosen": -260.916259765625, + "logps/rejected": -287.2959899902344, + "loss": 465.1678, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 29.1888427734375, + "rewards/margins": 2.88441801071167, + "rewards/rejected": 26.304424285888672, + "step": 540 + }, + { + "epoch": 0.10794896957801767, + "grad_norm": 5291.830653761251, + "learning_rate": 4.999061090193831e-07, + "logits/chosen": -2.7169575691223145, + "logits/rejected": -2.574615001678467, + "logps/chosen": -286.2722473144531, + "logps/rejected": -274.84759521484375, + "loss": 531.3509, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 36.53556442260742, + "rewards/margins": 9.655365943908691, + "rewards/rejected": 26.880207061767578, + "step": 550 + }, + { + "epoch": 0.1099116781157998, + "grad_norm": 4294.277561666831, + "learning_rate": 4.998533005084428e-07, + "logits/chosen": -2.6560511589050293, + "logits/rejected": -2.673675298690796, + "logps/chosen": -254.6729278564453, + "logps/rejected": -222.37979125976562, + "loss": 424.8929, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 26.852008819580078, + "rewards/margins": -2.5513346195220947, + "rewards/rejected": 29.40334129333496, + "step": 560 + }, + { + "epoch": 0.11187438665358194, + "grad_norm": 4013.8424370724483, + "learning_rate": 4.997887618234292e-07, + "logits/chosen": -2.606541872024536, + "logits/rejected": -2.682185649871826, + "logps/chosen": -250.31546020507812, + "logps/rejected": -274.77874755859375, + "loss": 506.972, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": 31.33298683166504, + "rewards/margins": -2.6476120948791504, + "rewards/rejected": 33.9806022644043, + "step": 570 + }, + { + "epoch": 0.11383709519136408, + "grad_norm": 3840.5056559095287, + "learning_rate": 4.997124959943201e-07, + "logits/chosen": -2.7121341228485107, + "logits/rejected": -2.5332818031311035, + "logps/chosen": -215.18496704101562, + "logps/rejected": -189.94198608398438, + "loss": 431.4372, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": 28.723220825195312, + "rewards/margins": 5.397341251373291, + "rewards/rejected": 23.325878143310547, + "step": 580 + }, + { + "epoch": 
0.11579980372914622, + "grad_norm": 3941.421650843535, + "learning_rate": 4.996245066016623e-07, + "logits/chosen": -2.670100688934326, + "logits/rejected": -2.584319591522217, + "logps/chosen": -223.4555206298828, + "logps/rejected": -204.05929565429688, + "loss": 347.5933, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 31.671985626220703, + "rewards/margins": 4.035216331481934, + "rewards/rejected": 27.636768341064453, + "step": 590 + }, + { + "epoch": 0.11776251226692837, + "grad_norm": 5609.191364022628, + "learning_rate": 4.995247977764035e-07, + "logits/chosen": -2.506758451461792, + "logits/rejected": -2.5981059074401855, + "logps/chosen": -204.51402282714844, + "logps/rejected": -173.1730194091797, + "loss": 510.3478, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 30.046728134155273, + "rewards/margins": -6.598879337310791, + "rewards/rejected": 36.645606994628906, + "step": 600 + }, + { + "epoch": 0.1197252208047105, + "grad_norm": 5009.758427999647, + "learning_rate": 4.994133741996982e-07, + "logits/chosen": -2.65734601020813, + "logits/rejected": -2.6545443534851074, + "logps/chosen": -258.2228088378906, + "logps/rejected": -210.0067901611328, + "loss": 484.3109, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 23.592992782592773, + "rewards/margins": -5.538031578063965, + "rewards/rejected": 29.131023406982422, + "step": 610 + }, + { + "epoch": 0.12168792934249265, + "grad_norm": 5363.664640832944, + "learning_rate": 4.992902411026877e-07, + "logits/chosen": -2.609527111053467, + "logits/rejected": -2.605999708175659, + "logps/chosen": -238.1632843017578, + "logps/rejected": -315.69329833984375, + "loss": 463.5376, + "rewards/accuracies": 0.5, + "rewards/chosen": 27.6790828704834, + "rewards/margins": 0.5688053369522095, + "rewards/rejected": 27.110280990600586, + "step": 620 + }, + { + "epoch": 0.12365063788027478, + "grad_norm": 4311.589890986141, + "learning_rate": 4.991554042662548e-07, + "logits/chosen": -2.5265004634857178, + "logits/rejected": -2.5692458152770996, + "logps/chosen": -211.8911590576172, + "logps/rejected": -210.7008056640625, + "loss": 440.8429, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 37.06153106689453, + "rewards/margins": 7.217554569244385, + "rewards/rejected": 29.84397315979004, + "step": 630 + }, + { + "epoch": 0.1256133464180569, + "grad_norm": 4609.647394782643, + "learning_rate": 4.990088700207525e-07, + "logits/chosen": -2.5927557945251465, + "logits/rejected": -2.6587564945220947, + "logps/chosen": -180.91180419921875, + "logps/rejected": -201.13131713867188, + "loss": 410.4563, + "rewards/accuracies": 0.4999999403953552, + "rewards/chosen": 28.912639617919922, + "rewards/margins": -10.199140548706055, + "rewards/rejected": 39.111778259277344, + "step": 640 + }, + { + "epoch": 0.12757605495583907, + "grad_norm": 4119.162413223579, + "learning_rate": 4.988506452457066e-07, + "logits/chosen": -2.570932388305664, + "logits/rejected": -2.645514488220215, + "logps/chosen": -258.30291748046875, + "logps/rejected": -267.41119384765625, + "loss": 445.8911, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 38.332210540771484, + "rewards/margins": 9.490628242492676, + "rewards/rejected": 28.841583251953125, + "step": 650 + }, + { + "epoch": 0.1295387634936212, + "grad_norm": 4789.877275203537, + "learning_rate": 4.986807373694925e-07, + "logits/chosen": -2.6149802207946777, + "logits/rejected": -2.5973472595214844, + "logps/chosen": -228.85330200195312, + 
"logps/rejected": -235.3708038330078, + "loss": 459.3869, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 35.13557815551758, + "rewards/margins": 2.73466157913208, + "rewards/rejected": 32.40091323852539, + "step": 660 + }, + { + "epoch": 0.13150147203140333, + "grad_norm": 4459.812327482822, + "learning_rate": 4.984991543689869e-07, + "logits/chosen": -2.660524368286133, + "logits/rejected": -2.5925040245056152, + "logps/chosen": -240.50296020507812, + "logps/rejected": -257.6099548339844, + "loss": 495.4315, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 42.89487838745117, + "rewards/margins": 19.89583969116211, + "rewards/rejected": 22.999040603637695, + "step": 670 + }, + { + "epoch": 0.13346418056918546, + "grad_norm": 4273.897641113578, + "learning_rate": 4.983059047691931e-07, + "logits/chosen": -2.6355478763580322, + "logits/rejected": -2.5462124347686768, + "logps/chosen": -227.7132568359375, + "logps/rejected": -188.1220703125, + "loss": 448.9462, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 30.760122299194336, + "rewards/margins": 4.663891792297363, + "rewards/rejected": 26.096227645874023, + "step": 680 + }, + { + "epoch": 0.13542688910696762, + "grad_norm": 4628.194176183738, + "learning_rate": 4.981009976428408e-07, + "logits/chosen": -2.48315691947937, + "logits/rejected": -2.4086811542510986, + "logps/chosen": -286.48016357421875, + "logps/rejected": -238.0033721923828, + "loss": 459.0327, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 24.821685791015625, + "rewards/margins": -3.5525615215301514, + "rewards/rejected": 28.374248504638672, + "step": 690 + }, + { + "epoch": 0.13738959764474976, + "grad_norm": 5417.6256406016655, + "learning_rate": 4.9788444260996e-07, + "logits/chosen": -2.6074776649475098, + "logits/rejected": -2.6096506118774414, + "logps/chosen": -235.46939086914062, + "logps/rejected": -211.37625122070312, + "loss": 433.4594, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 28.747339248657227, + "rewards/margins": -17.744359970092773, + "rewards/rejected": 46.49169921875, + "step": 700 + }, + { + "epoch": 0.1393523061825319, + "grad_norm": 5082.956665278541, + "learning_rate": 4.976562498374295e-07, + "logits/chosen": -2.6656737327575684, + "logits/rejected": -2.6231331825256348, + "logps/chosen": -257.89898681640625, + "logps/rejected": -230.61776733398438, + "loss": 477.1858, + "rewards/accuracies": 0.5, + "rewards/chosen": 31.849462509155273, + "rewards/margins": 3.5808677673339844, + "rewards/rejected": 28.268596649169922, + "step": 710 + }, + { + "epoch": 0.14131501472031405, + "grad_norm": 4683.782278834829, + "learning_rate": 4.974164300384997e-07, + "logits/chosen": -2.613971471786499, + "logits/rejected": -2.663684368133545, + "logps/chosen": -202.5010986328125, + "logps/rejected": -259.63616943359375, + "loss": 448.8101, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 28.70241355895996, + "rewards/margins": -5.085217475891113, + "rewards/rejected": 33.787635803222656, + "step": 720 + }, + { + "epoch": 0.14327772325809618, + "grad_norm": 4107.436513649201, + "learning_rate": 4.971649944722893e-07, + "logits/chosen": -2.601701498031616, + "logits/rejected": -2.654048204421997, + "logps/chosen": -228.1996612548828, + "logps/rejected": -249.9207763671875, + "loss": 463.9367, + "rewards/accuracies": 0.3333333432674408, + "rewards/chosen": 24.239540100097656, + "rewards/margins": -9.543767929077148, + "rewards/rejected": 33.78330993652344, + "step": 
730 + }, + { + "epoch": 0.1452404317958783, + "grad_norm": 3839.3323324164153, + "learning_rate": 4.96901954943257e-07, + "logits/chosen": -2.5906546115875244, + "logits/rejected": -2.418152332305908, + "logps/chosen": -228.8038330078125, + "logps/rejected": -142.23724365234375, + "loss": 447.7127, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 36.81355285644531, + "rewards/margins": 12.445768356323242, + "rewards/rejected": 24.367786407470703, + "step": 740 + }, + { + "epoch": 0.14720314033366044, + "grad_norm": 4176.412649636955, + "learning_rate": 4.96627323800647e-07, + "logits/chosen": -2.565721035003662, + "logits/rejected": -2.5852303504943848, + "logps/chosen": -196.6484375, + "logps/rejected": -216.9084930419922, + "loss": 453.8092, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 41.040504455566406, + "rewards/margins": 13.594090461730957, + "rewards/rejected": 27.446414947509766, + "step": 750 + }, + { + "epoch": 0.1491658488714426, + "grad_norm": 3941.2012801097057, + "learning_rate": 4.963411139379099e-07, + "logits/chosen": -2.6596925258636475, + "logits/rejected": -2.585742950439453, + "logps/chosen": -259.2139587402344, + "logps/rejected": -231.1278839111328, + "loss": 470.8556, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 31.395431518554688, + "rewards/margins": 1.9789135456085205, + "rewards/rejected": 29.416519165039062, + "step": 760 + }, + { + "epoch": 0.15112855740922473, + "grad_norm": 4955.044390669766, + "learning_rate": 4.960433387920964e-07, + "logits/chosen": -2.5374937057495117, + "logits/rejected": -2.5497844219207764, + "logps/chosen": -145.56930541992188, + "logps/rejected": -266.82159423828125, + "loss": 446.4508, + "rewards/accuracies": 0.43333330750465393, + "rewards/chosen": 24.17589569091797, + "rewards/margins": -1.778869390487671, + "rewards/rejected": 25.95476722717285, + "step": 770 + }, + { + "epoch": 0.15309126594700687, + "grad_norm": 7089.744405843409, + "learning_rate": 4.957340123432271e-07, + "logits/chosen": -2.5326530933380127, + "logits/rejected": -2.404533863067627, + "logps/chosen": -292.24658203125, + "logps/rejected": -204.31930541992188, + "loss": 478.8755, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 40.32138442993164, + "rewards/margins": 12.539506912231445, + "rewards/rejected": 27.781875610351562, + "step": 780 + }, + { + "epoch": 0.155053974484789, + "grad_norm": 4122.114584173073, + "learning_rate": 4.954131491136361e-07, + "logits/chosen": -2.564535617828369, + "logits/rejected": -2.4996018409729004, + "logps/chosen": -287.51141357421875, + "logps/rejected": -249.18173217773438, + "loss": 503.4248, + "rewards/accuracies": 0.5, + "rewards/chosen": 37.882667541503906, + "rewards/margins": -1.5718473196029663, + "rewards/rejected": 39.45451736450195, + "step": 790 + }, + { + "epoch": 0.15701668302257116, + "grad_norm": 4788.833682169245, + "learning_rate": 4.95080764167289e-07, + "logits/chosen": -2.548109531402588, + "logits/rejected": -2.588792562484741, + "logps/chosen": -217.90963745117188, + "logps/rejected": -243.854248046875, + "loss": 491.3777, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 32.40715789794922, + "rewards/margins": 2.9248015880584717, + "rewards/rejected": 29.482357025146484, + "step": 800 + }, + { + "epoch": 0.1589793915603533, + "grad_norm": 4268.850779393387, + "learning_rate": 4.94736873109076e-07, + "logits/chosen": -2.6233417987823486, + "logits/rejected": -2.61574125289917, + "logps/chosen": -220.2720184326172, + 
"logps/rejected": -201.5345001220703, + "loss": 470.0953, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 32.03792953491211, + "rewards/margins": 1.7439085245132446, + "rewards/rejected": 30.294021606445312, + "step": 810 + }, + { + "epoch": 0.16094210009813542, + "grad_norm": 5328.822800569311, + "learning_rate": 4.943814920840787e-07, + "logits/chosen": -2.4270339012145996, + "logits/rejected": -2.389511823654175, + "logps/chosen": -241.12448120117188, + "logps/rejected": -219.40316772460938, + "loss": 431.8633, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 37.06961441040039, + "rewards/margins": -9.050786972045898, + "rewards/rejected": 46.12040328979492, + "step": 820 + }, + { + "epoch": 0.16290480863591755, + "grad_norm": 4376.334525428355, + "learning_rate": 4.940146377768126e-07, + "logits/chosen": -2.5405285358428955, + "logits/rejected": -2.48429536819458, + "logps/chosen": -232.83358764648438, + "logps/rejected": -197.3223419189453, + "loss": 413.9131, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 54.56694412231445, + "rewards/margins": 22.161144256591797, + "rewards/rejected": 32.40580368041992, + "step": 830 + }, + { + "epoch": 0.1648675171736997, + "grad_norm": 4537.304696118614, + "learning_rate": 4.936363274104441e-07, + "logits/chosen": -2.5849764347076416, + "logits/rejected": -2.5415334701538086, + "logps/chosen": -254.42129516601562, + "logps/rejected": -192.87167358398438, + "loss": 405.1104, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 31.997446060180664, + "rewards/margins": -1.940429925918579, + "rewards/rejected": 33.9378776550293, + "step": 840 + }, + { + "epoch": 0.16683022571148184, + "grad_norm": 3909.8911410130995, + "learning_rate": 4.932465787459808e-07, + "logits/chosen": -2.677730083465576, + "logits/rejected": -2.5695979595184326, + "logps/chosen": -232.7654266357422, + "logps/rejected": -210.0175323486328, + "loss": 442.1903, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 33.31116485595703, + "rewards/margins": -9.833333969116211, + "rewards/rejected": 43.14450454711914, + "step": 850 + }, + { + "epoch": 0.16879293424926398, + "grad_norm": 4668.004498413699, + "learning_rate": 4.92845410081439e-07, + "logits/chosen": -2.4772255420684814, + "logits/rejected": -2.484923839569092, + "logps/chosen": -227.3833465576172, + "logps/rejected": -249.98782348632812, + "loss": 430.9701, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 31.499420166015625, + "rewards/margins": -2.1714835166931152, + "rewards/rejected": 33.67090606689453, + "step": 860 + }, + { + "epoch": 0.17075564278704614, + "grad_norm": 4657.11755860737, + "learning_rate": 4.924328402509833e-07, + "logits/chosen": -2.5510215759277344, + "logits/rejected": -2.518038272857666, + "logps/chosen": -243.21200561523438, + "logps/rejected": -201.63858032226562, + "loss": 473.4434, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 30.57634925842285, + "rewards/margins": -4.8953399658203125, + "rewards/rejected": 35.4716911315918, + "step": 870 + }, + { + "epoch": 0.17271835132482827, + "grad_norm": 4840.204230588623, + "learning_rate": 4.920088886240434e-07, + "logits/chosen": -2.4730472564697266, + "logits/rejected": -2.36600923538208, + "logps/chosen": -251.2110595703125, + "logps/rejected": -236.5413055419922, + "loss": 422.1851, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 33.34375, + "rewards/margins": 10.204463005065918, + "rewards/rejected": 23.139286041259766, + 
"step": 880 + }, + { + "epoch": 0.1746810598626104, + "grad_norm": 4157.926315887367, + "learning_rate": 4.915735751044045e-07, + "logits/chosen": -2.721060276031494, + "logits/rejected": -2.609971284866333, + "logps/chosen": -245.4339141845703, + "logps/rejected": -206.713134765625, + "loss": 466.6165, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 42.93861389160156, + "rewards/margins": 3.1537113189697266, + "rewards/rejected": 39.7849006652832, + "step": 890 + }, + { + "epoch": 0.17664376840039253, + "grad_norm": 5288.463900214572, + "learning_rate": 4.911269201292724e-07, + "logits/chosen": -2.6828205585479736, + "logits/rejected": -2.6067633628845215, + "logps/chosen": -258.6050720214844, + "logps/rejected": -215.49777221679688, + "loss": 475.2619, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 35.75965881347656, + "rewards/margins": -6.9160919189453125, + "rewards/rejected": 42.675758361816406, + "step": 900 + }, + { + "epoch": 0.1786064769381747, + "grad_norm": 4588.2274681178, + "learning_rate": 4.906689446683146e-07, + "logits/chosen": -2.6247589588165283, + "logits/rejected": -2.706444501876831, + "logps/chosen": -205.7058563232422, + "logps/rejected": -279.2490234375, + "loss": 483.4424, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 31.428142547607422, + "rewards/margins": -6.55536413192749, + "rewards/rejected": 37.9835090637207, + "step": 910 + }, + { + "epoch": 0.18056918547595682, + "grad_norm": 4709.655035818351, + "learning_rate": 4.901996702226755e-07, + "logits/chosen": -2.504795789718628, + "logits/rejected": -2.537221908569336, + "logps/chosen": -267.9532165527344, + "logps/rejected": -299.85638427734375, + "loss": 460.9857, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 32.67220687866211, + "rewards/margins": -5.247345924377441, + "rewards/rejected": 37.91954803466797, + "step": 920 + }, + { + "epoch": 0.18253189401373895, + "grad_norm": 4431.199882462497, + "learning_rate": 4.897191188239667e-07, + "logits/chosen": -2.809670925140381, + "logits/rejected": -2.5074360370635986, + "logps/chosen": -285.9214172363281, + "logps/rejected": -185.4030303955078, + "loss": 437.1225, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": 38.11512756347656, + "rewards/margins": 12.34187126159668, + "rewards/rejected": 25.773258209228516, + "step": 930 + }, + { + "epoch": 0.1844946025515211, + "grad_norm": 5356.0935582822785, + "learning_rate": 4.892273130332334e-07, + "logits/chosen": -2.711378574371338, + "logits/rejected": -2.675032138824463, + "logps/chosen": -285.53460693359375, + "logps/rejected": -320.78167724609375, + "loss": 473.1405, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 42.49833297729492, + "rewards/margins": 3.7150447368621826, + "rewards/rejected": 38.783287048339844, + "step": 940 + }, + { + "epoch": 0.18645731108930325, + "grad_norm": 4591.265894345519, + "learning_rate": 4.887242759398945e-07, + "logits/chosen": -2.5174307823181152, + "logits/rejected": -2.3968613147735596, + "logps/chosen": -164.76437377929688, + "logps/rejected": -159.45059204101562, + "loss": 422.0927, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 32.820526123046875, + "rewards/margins": -1.7133598327636719, + "rewards/rejected": 34.53388977050781, + "step": 950 + }, + { + "epoch": 0.18842001962708538, + "grad_norm": 4501.703024071969, + "learning_rate": 4.88210031160659e-07, + "logits/chosen": -2.5425117015838623, + "logits/rejected": -2.5814175605773926, + "logps/chosen": 
-235.32461547851562, + "logps/rejected": -220.5, + "loss": 481.1401, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 31.44137954711914, + "rewards/margins": -0.7482231855392456, + "rewards/rejected": 32.189605712890625, + "step": 960 + }, + { + "epoch": 0.1903827281648675, + "grad_norm": 3761.653691869692, + "learning_rate": 4.876846028384169e-07, + "logits/chosen": -2.6352555751800537, + "logits/rejected": -2.5650863647460938, + "logps/chosen": -176.44219970703125, + "logps/rejected": -205.5272674560547, + "loss": 379.0157, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 27.441858291625977, + "rewards/margins": -2.6123874187469482, + "rewards/rejected": 30.054244995117188, + "step": 970 + }, + { + "epoch": 0.19234543670264967, + "grad_norm": 3688.16826691894, + "learning_rate": 4.87148015641106e-07, + "logits/chosen": -2.642768144607544, + "logits/rejected": -2.6509671211242676, + "logps/chosen": -214.3375244140625, + "logps/rejected": -244.3864288330078, + "loss": 477.9643, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 36.71364974975586, + "rewards/margins": 6.217877388000488, + "rewards/rejected": 30.495773315429688, + "step": 980 + }, + { + "epoch": 0.1943081452404318, + "grad_norm": 3668.1326136631747, + "learning_rate": 4.866002947605539e-07, + "logits/chosen": -2.5380349159240723, + "logits/rejected": -2.4647417068481445, + "logps/chosen": -208.49099731445312, + "logps/rejected": -204.13284301757812, + "loss": 395.11, + "rewards/accuracies": 0.3333333134651184, + "rewards/chosen": 28.401355743408203, + "rewards/margins": -10.098089218139648, + "rewards/rejected": 38.49944305419922, + "step": 990 + }, + { + "epoch": 0.19627085377821393, + "grad_norm": 4429.484309033255, + "learning_rate": 4.860414659112948e-07, + "logits/chosen": -2.6225905418395996, + "logits/rejected": -2.456757068634033, + "logps/chosen": -207.32296752929688, + "logps/rejected": -165.3435516357422, + "loss": 445.1423, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 30.326162338256836, + "rewards/margins": 2.9115099906921387, + "rewards/rejected": 27.41465187072754, + "step": 1000 + }, + { + "epoch": 0.19823356231599606, + "grad_norm": 4709.9204917731795, + "learning_rate": 4.854715553293627e-07, + "logits/chosen": -2.6830430030822754, + "logits/rejected": -2.5512986183166504, + "logps/chosen": -270.9964904785156, + "logps/rejected": -176.79745483398438, + "loss": 496.4829, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": 43.204612731933594, + "rewards/margins": 17.4560546875, + "rewards/rejected": 25.74855613708496, + "step": 1010 + }, + { + "epoch": 0.20019627085377822, + "grad_norm": 3895.548818863484, + "learning_rate": 4.848905897710595e-07, + "logits/chosen": -2.3969950675964355, + "logits/rejected": -2.3870437145233154, + "logps/chosen": -277.4095764160156, + "logps/rejected": -171.2035369873047, + "loss": 464.7826, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 33.65400314331055, + "rewards/margins": -12.002626419067383, + "rewards/rejected": 45.65662384033203, + "step": 1020 + }, + { + "epoch": 0.20215897939156036, + "grad_norm": 5888.578415290401, + "learning_rate": 4.842985965116987e-07, + "logits/chosen": -2.638823986053467, + "logits/rejected": -2.5890250205993652, + "logps/chosen": -303.5338439941406, + "logps/rejected": -239.8367919921875, + "loss": 473.8742, + "rewards/accuracies": 0.5, + "rewards/chosen": 37.500709533691406, + "rewards/margins": -4.421719551086426, + "rewards/rejected": 
41.92242431640625, + "step": 1030 + }, + { + "epoch": 0.2041216879293425, + "grad_norm": 13266.38798683862, + "learning_rate": 4.836956033443253e-07, + "logits/chosen": -2.5955393314361572, + "logits/rejected": -2.5155348777770996, + "logps/chosen": -275.0190124511719, + "logps/rejected": -253.9836883544922, + "loss": 465.175, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 92.16295623779297, + "rewards/margins": -35.01982879638672, + "rewards/rejected": 127.18278503417969, + "step": 1040 + }, + { + "epoch": 0.20608439646712462, + "grad_norm": 4932.727575414187, + "learning_rate": 4.830816385784104e-07, + "logits/chosen": -2.684427499771118, + "logits/rejected": -2.6435837745666504, + "logps/chosen": -273.91754150390625, + "logps/rejected": -245.6715850830078, + "loss": 466.2123, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 33.9106330871582, + "rewards/margins": 0.9209011197090149, + "rewards/rejected": 32.9897346496582, + "step": 1050 + }, + { + "epoch": 0.20804710500490678, + "grad_norm": 4776.035725293843, + "learning_rate": 4.824567310385226e-07, + "logits/chosen": -2.6558475494384766, + "logits/rejected": -2.6177608966827393, + "logps/chosen": -286.10113525390625, + "logps/rejected": -227.9519805908203, + "loss": 482.4986, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": 31.268722534179688, + "rewards/margins": -2.8335373401641846, + "rewards/rejected": 34.102256774902344, + "step": 1060 + }, + { + "epoch": 0.2100098135426889, + "grad_norm": 5901.273056179599, + "learning_rate": 4.818209100629744e-07, + "logits/chosen": -2.5953338146209717, + "logits/rejected": -2.552004814147949, + "logps/chosen": -203.58914184570312, + "logps/rejected": -225.4922332763672, + "loss": 445.1609, + "rewards/accuracies": 0.5, + "rewards/chosen": 45.0809440612793, + "rewards/margins": -0.3565046191215515, + "rewards/rejected": 45.43744659423828, + "step": 1070 + }, + { + "epoch": 0.21197252208047104, + "grad_norm": 3305.566013387382, + "learning_rate": 4.81174205502445e-07, + "logits/chosen": -2.5669751167297363, + "logits/rejected": -2.6002602577209473, + "logps/chosen": -190.98147583007812, + "logps/rejected": -173.95994567871094, + "loss": 380.1803, + "rewards/accuracies": 0.5, + "rewards/chosen": 30.217853546142578, + "rewards/margins": -7.351622581481934, + "rewards/rejected": 37.56947326660156, + "step": 1080 + }, + { + "epoch": 0.2139352306182532, + "grad_norm": 3711.398480605608, + "learning_rate": 4.80516647718579e-07, + "logits/chosen": -2.5809082984924316, + "logits/rejected": -2.51855206489563, + "logps/chosen": -204.2241973876953, + "logps/rejected": -218.84872436523438, + "loss": 390.2722, + "rewards/accuracies": 0.5666666030883789, + "rewards/chosen": 35.5487174987793, + "rewards/margins": 0.2038247287273407, + "rewards/rejected": 35.34489059448242, + "step": 1090 + }, + { + "epoch": 0.21589793915603533, + "grad_norm": 3952.008170028738, + "learning_rate": 4.798482675825602e-07, + "logits/chosen": -2.66428279876709, + "logits/rejected": -2.694563388824463, + "logps/chosen": -179.92654418945312, + "logps/rejected": -216.7838897705078, + "loss": 455.5333, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 28.27036476135254, + "rewards/margins": -3.186108350753784, + "rewards/rejected": 31.456470489501953, + "step": 1100 + }, + { + "epoch": 0.21786064769381747, + "grad_norm": 5288.6150146613545, + "learning_rate": 4.791690964736636e-07, + "logits/chosen": -2.5695927143096924, + "logits/rejected": -2.5862300395965576, + 
"logps/chosen": -244.2303009033203, + "logps/rejected": -182.8033447265625, + "loss": 417.4308, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 67.5400619506836, + "rewards/margins": 35.33257293701172, + "rewards/rejected": 32.207496643066406, + "step": 1110 + }, + { + "epoch": 0.2198233562315996, + "grad_norm": 5347.641800993126, + "learning_rate": 4.78479166277781e-07, + "logits/chosen": -2.6153831481933594, + "logits/rejected": -2.4904675483703613, + "logps/chosen": -298.97479248046875, + "logps/rejected": -240.39431762695312, + "loss": 515.0782, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 32.2321891784668, + "rewards/margins": -12.380395889282227, + "rewards/rejected": 44.612586975097656, + "step": 1120 + }, + { + "epoch": 0.22178606476938176, + "grad_norm": 5258.094474663755, + "learning_rate": 4.777785093859247e-07, + "logits/chosen": -2.770720958709717, + "logits/rejected": -2.547207832336426, + "logps/chosen": -241.500244140625, + "logps/rejected": -252.56668090820312, + "loss": 480.3711, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 43.165924072265625, + "rewards/margins": 3.580148220062256, + "rewards/rejected": 39.585777282714844, + "step": 1130 + }, + { + "epoch": 0.2237487733071639, + "grad_norm": 5553.182619936151, + "learning_rate": 4.770671586927063e-07, + "logits/chosen": -2.7911696434020996, + "logits/rejected": -2.684234142303467, + "logps/chosen": -317.8401794433594, + "logps/rejected": -291.2255859375, + "loss": 496.6098, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 39.029273986816406, + "rewards/margins": 1.6814048290252686, + "rewards/rejected": 37.347869873046875, + "step": 1140 + }, + { + "epoch": 0.22571148184494602, + "grad_norm": 4408.1202191004995, + "learning_rate": 4.7634514759479275e-07, + "logits/chosen": -2.7609219551086426, + "logits/rejected": -2.6142373085021973, + "logps/chosen": -247.8801727294922, + "logps/rejected": -194.33895874023438, + "loss": 461.1519, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 37.78066635131836, + "rewards/margins": 11.475553512573242, + "rewards/rejected": 26.305110931396484, + "step": 1150 + }, + { + "epoch": 0.22767419038272815, + "grad_norm": 5480.72496037876, + "learning_rate": 4.7561250998933835e-07, + "logits/chosen": -2.5984532833099365, + "logits/rejected": -2.4840378761291504, + "logps/chosen": -309.9330139160156, + "logps/rejected": -181.5688018798828, + "loss": 490.8118, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 34.388038635253906, + "rewards/margins": 1.5576369762420654, + "rewards/rejected": 32.830406188964844, + "step": 1160 + }, + { + "epoch": 0.2296368989205103, + "grad_norm": 4453.143374911204, + "learning_rate": 4.7486928027239304e-07, + "logits/chosen": -2.5082366466522217, + "logits/rejected": -2.4943606853485107, + "logps/chosen": -179.74754333496094, + "logps/rejected": -184.5972137451172, + "loss": 471.2, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 25.90127944946289, + "rewards/margins": -1.3986310958862305, + "rewards/rejected": 27.299907684326172, + "step": 1170 + }, + { + "epoch": 0.23159960745829244, + "grad_norm": 5802.440683508613, + "learning_rate": 4.7411549333728807e-07, + "logits/chosen": -2.5889554023742676, + "logits/rejected": -2.634286403656006, + "logps/chosen": -247.23440551757812, + "logps/rejected": -242.90347290039062, + "loss": 495.7646, + "rewards/accuracies": 0.36666667461395264, + "rewards/chosen": 31.239521026611328, + "rewards/margins": 
-10.11313247680664, + "rewards/rejected": 41.352657318115234, + "step": 1180 + }, + { + "epoch": 0.23356231599607458, + "grad_norm": 4881.3901064765905, + "learning_rate": 4.7335118457299756e-07, + "logits/chosen": -2.605823040008545, + "logits/rejected": -2.655714750289917, + "logps/chosen": -257.37359619140625, + "logps/rejected": -226.6381378173828, + "loss": 467.4191, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 30.597707748413086, + "rewards/margins": -4.192519187927246, + "rewards/rejected": 34.790225982666016, + "step": 1190 + }, + { + "epoch": 0.23552502453385674, + "grad_norm": 4259.232291896253, + "learning_rate": 4.7257638986247684e-07, + "logits/chosen": -2.685166120529175, + "logits/rejected": -2.6258645057678223, + "logps/chosen": -263.2113342285156, + "logps/rejected": -298.49688720703125, + "loss": 446.8362, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 33.98692321777344, + "rewards/margins": -12.568541526794434, + "rewards/rejected": 46.55546188354492, + "step": 1200 + }, + { + "epoch": 0.23748773307163887, + "grad_norm": 4436.6755123445055, + "learning_rate": 4.7179114558097814e-07, + "logits/chosen": -2.593357801437378, + "logits/rejected": -2.5329232215881348, + "logps/chosen": -226.9739227294922, + "logps/rejected": -177.63357543945312, + "loss": 467.4614, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 32.3291015625, + "rewards/margins": 5.060539722442627, + "rewards/rejected": 27.268564224243164, + "step": 1210 + }, + { + "epoch": 0.239450441609421, + "grad_norm": 4013.2887284238113, + "learning_rate": 4.709954885943428e-07, + "logits/chosen": -2.6918628215789795, + "logits/rejected": -2.6806817054748535, + "logps/chosen": -240.38052368164062, + "logps/rejected": -189.43763732910156, + "loss": 403.3918, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 32.36589431762695, + "rewards/margins": 4.384322166442871, + "rewards/rejected": 27.9815731048584, + "step": 1220 + }, + { + "epoch": 0.24141315014720313, + "grad_norm": 4930.575310666661, + "learning_rate": 4.7018945625727026e-07, + "logits/chosen": -2.7886033058166504, + "logits/rejected": -2.706328868865967, + "logps/chosen": -245.9519500732422, + "logps/rejected": -245.0874481201172, + "loss": 488.3562, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 31.739898681640625, + "rewards/margins": 7.568617343902588, + "rewards/rejected": 24.171281814575195, + "step": 1230 + }, + { + "epoch": 0.2433758586849853, + "grad_norm": 3942.7295284984402, + "learning_rate": 4.6937308641156447e-07, + "logits/chosen": -2.605045795440674, + "logits/rejected": -2.567878007888794, + "logps/chosen": -178.3212432861328, + "logps/rejected": -180.12820434570312, + "loss": 406.4432, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": 31.777252197265625, + "rewards/margins": -0.4026016294956207, + "rewards/rejected": 32.17985916137695, + "step": 1240 + }, + { + "epoch": 0.24533856722276742, + "grad_norm": 4262.859818723618, + "learning_rate": 4.685464173843574e-07, + "logits/chosen": -2.6398653984069824, + "logits/rejected": -2.648780345916748, + "logps/chosen": -197.7738037109375, + "logps/rejected": -225.6578826904297, + "loss": 441.8806, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 36.21776580810547, + "rewards/margins": 6.563495635986328, + "rewards/rejected": 29.654272079467773, + "step": 1250 + }, + { + "epoch": 0.24730127576054955, + "grad_norm": 5397.937316901458, + "learning_rate": 4.677094879863093e-07, + 
"logits/chosen": -2.6843018531799316, + "logits/rejected": -2.5696897506713867, + "logps/chosen": -220.08462524414062, + "logps/rejected": -189.0897216796875, + "loss": 482.1675, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 28.3251895904541, + "rewards/margins": 0.6852840185165405, + "rewards/rejected": 27.639904022216797, + "step": 1260 + }, + { + "epoch": 0.2492639842983317, + "grad_norm": 5100.629009033718, + "learning_rate": 4.66862337509787e-07, + "logits/chosen": -2.6803290843963623, + "logits/rejected": -2.6703336238861084, + "logps/chosen": -281.07177734375, + "logps/rejected": -186.40174865722656, + "loss": 521.8964, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 39.752471923828125, + "rewards/margins": 5.96087646484375, + "rewards/rejected": 33.79159164428711, + "step": 1270 + }, + { + "epoch": 0.2512266928361138, + "grad_norm": 4595.979870084018, + "learning_rate": 4.660050057270191e-07, + "logits/chosen": -2.6391656398773193, + "logits/rejected": -2.45827054977417, + "logps/chosen": -219.8842010498047, + "logps/rejected": -184.69334411621094, + "loss": 478.3666, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 36.203399658203125, + "rewards/margins": 6.85970401763916, + "rewards/rejected": 29.34369468688965, + "step": 1280 + }, + { + "epoch": 0.25318940137389595, + "grad_norm": 4158.646585214779, + "learning_rate": 4.6513753288822833e-07, + "logits/chosen": -2.6670408248901367, + "logits/rejected": -2.617326259613037, + "logps/chosen": -126.4794692993164, + "logps/rejected": -147.56619262695312, + "loss": 373.9057, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 32.822105407714844, + "rewards/margins": 7.652844429016113, + "rewards/rejected": 25.169261932373047, + "step": 1290 + }, + { + "epoch": 0.25515210991167814, + "grad_norm": 4963.405587351429, + "learning_rate": 4.6425995971974265e-07, + "logits/chosen": -2.599966287612915, + "logits/rejected": -2.575037956237793, + "logps/chosen": -255.7868194580078, + "logps/rejected": -174.5104522705078, + "loss": 466.948, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 27.49749755859375, + "rewards/margins": -15.012187004089355, + "rewards/rejected": 42.50968933105469, + "step": 1300 + }, + { + "epoch": 0.25711481844946027, + "grad_norm": 5415.187880414271, + "learning_rate": 4.633723274220824e-07, + "logits/chosen": -2.679082155227661, + "logits/rejected": -2.613781690597534, + "logps/chosen": -257.79620361328125, + "logps/rejected": -279.03985595703125, + "loss": 522.431, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 39.13835906982422, + "rewards/margins": -5.3103437423706055, + "rewards/rejected": 44.448707580566406, + "step": 1310 + }, + { + "epoch": 0.2590775269872424, + "grad_norm": 2682.375646417619, + "learning_rate": 4.624746776680267e-07, + "logits/chosen": -2.4919180870056152, + "logits/rejected": -2.525841474533081, + "logps/chosen": -224.8561553955078, + "logps/rejected": -211.89932250976562, + "loss": 418.4607, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 23.724262237548828, + "rewards/margins": -11.103318214416504, + "rewards/rejected": 34.827579498291016, + "step": 1320 + }, + { + "epoch": 0.26104023552502453, + "grad_norm": 4531.628992068344, + "learning_rate": 4.6156705260065634e-07, + "logits/chosen": -2.464676856994629, + "logits/rejected": -2.4597787857055664, + "logps/chosen": -178.28427124023438, + "logps/rejected": -178.5281219482422, + "loss": 423.1229, + "rewards/accuracies": 
0.2666666507720947, + "rewards/chosen": 30.889575958251953, + "rewards/margins": -15.716069221496582, + "rewards/rejected": 46.60564041137695, + "step": 1330 + }, + { + "epoch": 0.26300294406280667, + "grad_norm": 4075.9441228112637, + "learning_rate": 4.606494948313758e-07, + "logits/chosen": -2.5901541709899902, + "logits/rejected": -2.497117042541504, + "logps/chosen": -204.6013946533203, + "logps/rejected": -216.5972442626953, + "loss": 445.6849, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 37.51020050048828, + "rewards/margins": -1.5504966974258423, + "rewards/rejected": 39.060699462890625, + "step": 1340 + }, + { + "epoch": 0.2649656526005888, + "grad_norm": 5057.830525011509, + "learning_rate": 4.597220474379125e-07, + "logits/chosen": -2.656686782836914, + "logits/rejected": -2.7043204307556152, + "logps/chosen": -296.2122802734375, + "logps/rejected": -289.3331604003906, + "loss": 464.1113, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 31.062763214111328, + "rewards/margins": -2.759782552719116, + "rewards/rejected": 33.822547912597656, + "step": 1350 + }, + { + "epoch": 0.26692836113837093, + "grad_norm": 3919.94236710167, + "learning_rate": 4.587847539622942e-07, + "logits/chosen": -2.622345447540283, + "logits/rejected": -2.6256356239318848, + "logps/chosen": -341.5245361328125, + "logps/rejected": -288.871337890625, + "loss": 485.2092, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 36.633548736572266, + "rewards/margins": 2.233851671218872, + "rewards/rejected": 34.39970016479492, + "step": 1360 + }, + { + "epoch": 0.2688910696761531, + "grad_norm": 4920.1016303208735, + "learning_rate": 4.5783765840880505e-07, + "logits/chosen": -2.7265937328338623, + "logits/rejected": -2.666107416152954, + "logps/chosen": -282.02496337890625, + "logps/rejected": -271.85833740234375, + "loss": 492.7476, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 46.0014533996582, + "rewards/margins": 11.102203369140625, + "rewards/rejected": 34.89925003051758, + "step": 1370 + }, + { + "epoch": 0.27085377821393525, + "grad_norm": 6729.628673965608, + "learning_rate": 4.568808052419196e-07, + "logits/chosen": -2.6116538047790527, + "logits/rejected": -2.5717697143554688, + "logps/chosen": -194.3059844970703, + "logps/rejected": -184.2753143310547, + "loss": 464.076, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 30.298664093017578, + "rewards/margins": -3.016819477081299, + "rewards/rejected": 33.31548309326172, + "step": 1380 + }, + { + "epoch": 0.2728164867517174, + "grad_norm": 4537.998136989965, + "learning_rate": 4.5591423938421513e-07, + "logits/chosen": -2.5770394802093506, + "logits/rejected": -2.492131471633911, + "logps/chosen": -244.56942749023438, + "logps/rejected": -211.59860229492188, + "loss": 472.8399, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 39.29642105102539, + "rewards/margins": -7.591795921325684, + "rewards/rejected": 46.888221740722656, + "step": 1390 + }, + { + "epoch": 0.2747791952894995, + "grad_norm": 4061.718067103401, + "learning_rate": 4.549380062142627e-07, + "logits/chosen": -2.6460325717926025, + "logits/rejected": -2.594343900680542, + "logps/chosen": -225.2256317138672, + "logps/rejected": -253.9397735595703, + "loss": 511.3023, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 35.28333282470703, + "rewards/margins": -4.53452205657959, + "rewards/rejected": 39.81785583496094, + "step": 1400 + }, + { + "epoch": 0.27674190382728164, + "grad_norm": 
6543.838463953099, + "learning_rate": 4.5395215156449683e-07, + "logits/chosen": -2.5624992847442627, + "logits/rejected": -2.5937201976776123, + "logps/chosen": -255.32485961914062, + "logps/rejected": -288.13134765625, + "loss": 467.8676, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 47.551822662353516, + "rewards/margins": -13.412294387817383, + "rewards/rejected": 60.9641227722168, + "step": 1410 + }, + { + "epoch": 0.2787046123650638, + "grad_norm": 4174.873530739881, + "learning_rate": 4.5295672171906365e-07, + "logits/chosen": -2.61214542388916, + "logits/rejected": -2.493485450744629, + "logps/chosen": -234.7046661376953, + "logps/rejected": -192.59169006347656, + "loss": 454.9075, + "rewards/accuracies": 0.5, + "rewards/chosen": 30.760204315185547, + "rewards/margins": -4.985699653625488, + "rewards/rejected": 35.74591064453125, + "step": 1420 + }, + { + "epoch": 0.2806673209028459, + "grad_norm": 3618.032412740325, + "learning_rate": 4.5195176341164765e-07, + "logits/chosen": -2.6637091636657715, + "logits/rejected": -2.6291720867156982, + "logps/chosen": -235.5375518798828, + "logps/rejected": -263.6867370605469, + "loss": 512.698, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 32.95092010498047, + "rewards/margins": 1.7986339330673218, + "rewards/rejected": 31.152286529541016, + "step": 1430 + }, + { + "epoch": 0.2826300294406281, + "grad_norm": 4437.597451759081, + "learning_rate": 4.509373238232782e-07, + "logits/chosen": -2.4831185340881348, + "logits/rejected": -2.482316732406616, + "logps/chosen": -234.2296142578125, + "logps/rejected": -185.9305419921875, + "loss": 484.7292, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 64.58255004882812, + "rewards/margins": 16.714292526245117, + "rewards/rejected": 47.868255615234375, + "step": 1440 + }, + { + "epoch": 0.2845927379784102, + "grad_norm": 4372.421620305732, + "learning_rate": 4.499134505801141e-07, + "logits/chosen": -2.519425868988037, + "logits/rejected": -2.5516860485076904, + "logps/chosen": -203.31507873535156, + "logps/rejected": -210.93594360351562, + "loss": 418.6516, + "rewards/accuracies": 0.3333333432674408, + "rewards/chosen": 27.685678482055664, + "rewards/margins": -13.57408618927002, + "rewards/rejected": 41.259761810302734, + "step": 1450 + }, + { + "epoch": 0.28655544651619236, + "grad_norm": 4621.812160949887, + "learning_rate": 4.488801917512076e-07, + "logits/chosen": -2.62172269821167, + "logits/rejected": -2.729160785675049, + "logps/chosen": -257.2018127441406, + "logps/rejected": -292.1290588378906, + "loss": 477.9748, + "rewards/accuracies": 0.5, + "rewards/chosen": 32.220680236816406, + "rewards/margins": -12.17983627319336, + "rewards/rejected": 44.4005126953125, + "step": 1460 + }, + { + "epoch": 0.2885181550539745, + "grad_norm": 5196.492652500245, + "learning_rate": 4.478375958462479e-07, + "logits/chosen": -2.566138744354248, + "logits/rejected": -2.452779531478882, + "logps/chosen": -257.0789794921875, + "logps/rejected": -185.4340362548828, + "loss": 460.3571, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 41.83165740966797, + "rewards/margins": 1.7921245098114014, + "rewards/rejected": 40.039527893066406, + "step": 1470 + }, + { + "epoch": 0.2904808635917566, + "grad_norm": 4148.920158831787, + "learning_rate": 4.467857118132833e-07, + "logits/chosen": -2.556814670562744, + "logits/rejected": -2.593320846557617, + "logps/chosen": -215.48593139648438, + "logps/rejected": -211.2922821044922, + "loss": 457.0326, + 
"rewards/accuracies": 0.5, + "rewards/chosen": 31.426509857177734, + "rewards/margins": -1.3721458911895752, + "rewards/rejected": 32.79865646362305, + "step": 1480 + }, + { + "epoch": 0.29244357212953875, + "grad_norm": 4273.569689206981, + "learning_rate": 4.457245890364235e-07, + "logits/chosen": -2.646486759185791, + "logits/rejected": -2.502842426300049, + "logps/chosen": -308.3550109863281, + "logps/rejected": -232.5363006591797, + "loss": 479.2763, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": 41.21160888671875, + "rewards/margins": 9.703352928161621, + "rewards/rejected": 31.508255004882812, + "step": 1490 + }, + { + "epoch": 0.2944062806673209, + "grad_norm": 4652.619016984834, + "learning_rate": 4.4465427733352124e-07, + "logits/chosen": -2.576737642288208, + "logits/rejected": -2.547116279602051, + "logps/chosen": -243.11770629882812, + "logps/rejected": -234.5743865966797, + "loss": 433.5632, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 39.526947021484375, + "rewards/margins": 1.5665864944458008, + "rewards/rejected": 37.960357666015625, + "step": 1500 + }, + { + "epoch": 0.296368989205103, + "grad_norm": 5026.282262955393, + "learning_rate": 4.43574826953833e-07, + "logits/chosen": -2.5200655460357666, + "logits/rejected": -2.495128631591797, + "logps/chosen": -278.53692626953125, + "logps/rejected": -280.415283203125, + "loss": 453.0888, + "rewards/accuracies": 0.29999998211860657, + "rewards/chosen": 32.80377960205078, + "rewards/margins": -12.697413444519043, + "rewards/rejected": 45.501197814941406, + "step": 1510 + }, + { + "epoch": 0.2983316977428852, + "grad_norm": 3558.0012849900972, + "learning_rate": 4.4248628857565997e-07, + "logits/chosen": -2.4504408836364746, + "logits/rejected": -2.351283550262451, + "logps/chosen": -299.6264953613281, + "logps/rejected": -180.04957580566406, + "loss": 428.5945, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 36.6990966796875, + "rewards/margins": 4.336518287658691, + "rewards/rejected": 32.36257553100586, + "step": 1520 + }, + { + "epoch": 0.30029440628066734, + "grad_norm": 4483.806390292095, + "learning_rate": 4.413887133039692e-07, + "logits/chosen": -2.776367664337158, + "logits/rejected": -2.4363582134246826, + "logps/chosen": -366.796630859375, + "logps/rejected": -245.8273162841797, + "loss": 497.7816, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 42.83641815185547, + "rewards/margins": 14.016261100769043, + "rewards/rejected": 28.82015609741211, + "step": 1530 + }, + { + "epoch": 0.30225711481844947, + "grad_norm": 4316.725031558483, + "learning_rate": 4.4028215266799395e-07, + "logits/chosen": -2.6478710174560547, + "logits/rejected": -2.518744945526123, + "logps/chosen": -237.1392822265625, + "logps/rejected": -193.0914764404297, + "loss": 463.9993, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 35.98551559448242, + "rewards/margins": 7.529461860656738, + "rewards/rejected": 28.456050872802734, + "step": 1540 + }, + { + "epoch": 0.3042198233562316, + "grad_norm": 4976.322296464837, + "learning_rate": 4.391666586188145e-07, + "logits/chosen": -2.482912302017212, + "logits/rejected": -2.411414623260498, + "logps/chosen": -181.9075469970703, + "logps/rejected": -195.3955535888672, + "loss": 406.0816, + "rewards/accuracies": 0.43333330750465393, + "rewards/chosen": 30.790664672851562, + "rewards/margins": -0.060872841626405716, + "rewards/rejected": 30.85154151916504, + "step": 1550 + }, + { + "epoch": 0.30618253189401373, + 
"grad_norm": 4631.128903009067, + "learning_rate": 4.380422835269193e-07, + "logits/chosen": -2.6279826164245605, + "logits/rejected": -2.5860679149627686, + "logps/chosen": -221.2560577392578, + "logps/rejected": -227.3316192626953, + "loss": 449.2048, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 32.71919631958008, + "rewards/margins": -6.1832380294799805, + "rewards/rejected": 38.90243148803711, + "step": 1560 + }, + { + "epoch": 0.30814524043179586, + "grad_norm": 3520.0287409004613, + "learning_rate": 4.3690908017974596e-07, + "logits/chosen": -2.519622802734375, + "logits/rejected": -2.4551331996917725, + "logps/chosen": -193.1994171142578, + "logps/rejected": -201.52053833007812, + "loss": 396.6672, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 32.537864685058594, + "rewards/margins": -39.2354736328125, + "rewards/rejected": 71.7733383178711, + "step": 1570 + }, + { + "epoch": 0.310107948969578, + "grad_norm": 5125.863786211232, + "learning_rate": 4.3576710177920356e-07, + "logits/chosen": -2.6030383110046387, + "logits/rejected": -2.582373857498169, + "logps/chosen": -198.5616912841797, + "logps/rejected": -195.01638793945312, + "loss": 405.4208, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 37.846439361572266, + "rewards/margins": 2.210106372833252, + "rewards/rejected": 35.636329650878906, + "step": 1580 + }, + { + "epoch": 0.3120706575073602, + "grad_norm": 3936.5961043796497, + "learning_rate": 4.346164019391742e-07, + "logits/chosen": -2.6877033710479736, + "logits/rejected": -2.572878360748291, + "logps/chosen": -327.56475830078125, + "logps/rejected": -299.89569091796875, + "loss": 500.4787, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 33.7691650390625, + "rewards/margins": 0.5889126062393188, + "rewards/rejected": 33.18025588989258, + "step": 1590 + }, + { + "epoch": 0.3140333660451423, + "grad_norm": 3855.976180413627, + "learning_rate": 4.3345703468299634e-07, + "logits/chosen": -2.392334461212158, + "logits/rejected": -2.389955759048462, + "logps/chosen": -237.44192504882812, + "logps/rejected": -223.8633270263672, + "loss": 473.9611, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 40.286521911621094, + "rewards/margins": -6.727256774902344, + "rewards/rejected": 47.01377868652344, + "step": 1600 + }, + { + "epoch": 0.31599607458292445, + "grad_norm": 5009.958399563597, + "learning_rate": 4.322890544409286e-07, + "logits/chosen": -2.5345873832702637, + "logits/rejected": -2.32863187789917, + "logps/chosen": -261.5216369628906, + "logps/rejected": -230.5297088623047, + "loss": 489.7756, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 52.92722702026367, + "rewards/margins": 24.752338409423828, + "rewards/rejected": 28.174877166748047, + "step": 1610 + }, + { + "epoch": 0.3179587831207066, + "grad_norm": 3977.265195051046, + "learning_rate": 4.311125160475938e-07, + "logits/chosen": -2.5485174655914307, + "logits/rejected": -2.572300910949707, + "logps/chosen": -235.796630859375, + "logps/rejected": -310.8801574707031, + "loss": 473.5826, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 36.304649353027344, + "rewards/margins": -5.325404167175293, + "rewards/rejected": 41.63005828857422, + "step": 1620 + }, + { + "epoch": 0.3199214916584887, + "grad_norm": 4937.364126383082, + "learning_rate": 4.299274747394055e-07, + "logits/chosen": -2.4575133323669434, + "logits/rejected": -2.456610679626465, + "logps/chosen": -230.38827514648438, + "logps/rejected": 
-210.62454223632812, + "loss": 445.8371, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": 35.976993560791016, + "rewards/margins": 4.183018684387207, + "rewards/rejected": 31.793975830078125, + "step": 1630 + }, + { + "epoch": 0.32188420019627084, + "grad_norm": 5426.95587112137, + "learning_rate": 4.287339861519737e-07, + "logits/chosen": -2.54447340965271, + "logits/rejected": -2.5630240440368652, + "logps/chosen": -260.1687927246094, + "logps/rejected": -245.56338500976562, + "loss": 456.7415, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 39.52373504638672, + "rewards/margins": 2.219897747039795, + "rewards/rejected": 37.303836822509766, + "step": 1640 + }, + { + "epoch": 0.323846908734053, + "grad_norm": 4842.585858336256, + "learning_rate": 4.275321063174936e-07, + "logits/chosen": -2.6207590103149414, + "logits/rejected": -2.650418281555176, + "logps/chosen": -298.9424133300781, + "logps/rejected": -225.19558715820312, + "loss": 488.1541, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 37.705909729003906, + "rewards/margins": 7.440756320953369, + "rewards/rejected": 30.265148162841797, + "step": 1650 + }, + { + "epoch": 0.3258096172718351, + "grad_norm": 4378.29508595839, + "learning_rate": 4.2632189166211454e-07, + "logits/chosen": -2.476266622543335, + "logits/rejected": -2.595913887023926, + "logps/chosen": -204.14492797851562, + "logps/rejected": -215.26126098632812, + "loss": 449.5967, + "rewards/accuracies": 0.40000003576278687, + "rewards/chosen": 33.3272705078125, + "rewards/margins": -15.386652946472168, + "rewards/rejected": 48.71392059326172, + "step": 1660 + }, + { + "epoch": 0.3277723258096173, + "grad_norm": 4410.765655477834, + "learning_rate": 4.251033990032912e-07, + "logits/chosen": -2.563709259033203, + "logits/rejected": -2.550846576690674, + "logps/chosen": -253.3117218017578, + "logps/rejected": -275.704833984375, + "loss": 464.9021, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 43.785118103027344, + "rewards/margins": 15.132052421569824, + "rewards/rejected": 28.653064727783203, + "step": 1670 + }, + { + "epoch": 0.3297350343473994, + "grad_norm": 4903.83578654043, + "learning_rate": 4.238766855471161e-07, + "logits/chosen": -2.5991299152374268, + "logits/rejected": -2.599780321121216, + "logps/chosen": -297.52630615234375, + "logps/rejected": -195.06985473632812, + "loss": 473.8348, + "rewards/accuracies": 0.5, + "rewards/chosen": 49.35599899291992, + "rewards/margins": 9.37015438079834, + "rewards/rejected": 39.98583984375, + "step": 1680 + }, + { + "epoch": 0.33169774288518156, + "grad_norm": 5158.016326959571, + "learning_rate": 4.226418088856335e-07, + "logits/chosen": -2.48799467086792, + "logits/rejected": -2.589542865753174, + "logps/chosen": -228.2854766845703, + "logps/rejected": -296.7008972167969, + "loss": 479.6581, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 33.118438720703125, + "rewards/margins": -2.1525566577911377, + "rewards/rejected": 35.27099609375, + "step": 1690 + }, + { + "epoch": 0.3336604514229637, + "grad_norm": 4286.561139019483, + "learning_rate": 4.2139882699413613e-07, + "logits/chosen": -2.692596673965454, + "logits/rejected": -2.530569314956665, + "logps/chosen": -234.590087890625, + "logps/rejected": -154.15579223632812, + "loss": 410.8604, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 37.41648864746094, + "rewards/margins": 0.9406414031982422, + "rewards/rejected": 36.47584915161133, + "step": 1700 + }, + { + "epoch": 
0.3356231599607458, + "grad_norm": 4907.107578923295, + "learning_rate": 4.2014779822844274e-07, + "logits/chosen": -2.5812151432037354, + "logits/rejected": -2.5249361991882324, + "logps/chosen": -194.2213897705078, + "logps/rejected": -230.5895538330078, + "loss": 483.7218, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 35.815399169921875, + "rewards/margins": 10.298858642578125, + "rewards/rejected": 25.516538619995117, + "step": 1710 + }, + { + "epoch": 0.33758586849852795, + "grad_norm": 5150.776933193368, + "learning_rate": 4.18888781322159e-07, + "logits/chosen": -2.5948588848114014, + "logits/rejected": -2.361288547515869, + "logps/chosen": -212.5074462890625, + "logps/rejected": -220.21035766601562, + "loss": 468.4072, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 34.529293060302734, + "rewards/margins": -2.4311232566833496, + "rewards/rejected": 36.96042251586914, + "step": 1720 + }, + { + "epoch": 0.3395485770363101, + "grad_norm": 4734.509257051262, + "learning_rate": 4.176218353839195e-07, + "logits/chosen": -2.6863338947296143, + "logits/rejected": -2.690185308456421, + "logps/chosen": -226.93643188476562, + "logps/rejected": -184.9244842529297, + "loss": 455.7779, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 35.38648223876953, + "rewards/margins": 10.249422073364258, + "rewards/rejected": 25.137056350708008, + "step": 1730 + }, + { + "epoch": 0.34151128557409227, + "grad_norm": 4277.488404749702, + "learning_rate": 4.1634701989461325e-07, + "logits/chosen": -2.5563833713531494, + "logits/rejected": -2.575597047805786, + "logps/chosen": -227.07296752929688, + "logps/rejected": -235.7079315185547, + "loss": 463.36, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 41.53756332397461, + "rewards/margins": -0.9080625772476196, + "rewards/rejected": 42.44562911987305, + "step": 1740 + }, + { + "epoch": 0.3434739941118744, + "grad_norm": 5121.3329084397665, + "learning_rate": 4.1506439470459056e-07, + "logits/chosen": -2.5935757160186768, + "logits/rejected": -2.6307332515716553, + "logps/chosen": -204.5602569580078, + "logps/rejected": -185.46250915527344, + "loss": 415.0569, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 29.67550277709961, + "rewards/margins": -6.236639976501465, + "rewards/rejected": 35.91214370727539, + "step": 1750 + }, + { + "epoch": 0.34543670264965654, + "grad_norm": 4288.60724056639, + "learning_rate": 4.137740200308537e-07, + "logits/chosen": -2.769371509552002, + "logits/rejected": -2.630138874053955, + "logps/chosen": -255.390625, + "logps/rejected": -238.37637329101562, + "loss": 447.6325, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": 35.000877380371094, + "rewards/margins": 6.092411994934082, + "rewards/rejected": 28.90846824645996, + "step": 1760 + }, + { + "epoch": 0.34739941118743867, + "grad_norm": 4066.385997799853, + "learning_rate": 4.124759564542295e-07, + "logits/chosen": -2.6266300678253174, + "logits/rejected": -2.5432302951812744, + "logps/chosen": -257.3196716308594, + "logps/rejected": -187.99392700195312, + "loss": 444.7196, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 38.820655822753906, + "rewards/margins": 14.107760429382324, + "rewards/rejected": 24.712894439697266, + "step": 1770 + }, + { + "epoch": 0.3493621197252208, + "grad_norm": 4219.533318286452, + "learning_rate": 4.111702649165255e-07, + "logits/chosen": -2.7003438472747803, + "logits/rejected": -2.574720859527588, + "logps/chosen": -214.9884490966797, + 
"logps/rejected": -178.40650939941406, + "loss": 396.0387, + "rewards/accuracies": 0.533333420753479, + "rewards/chosen": 36.658668518066406, + "rewards/margins": -0.34727534651756287, + "rewards/rejected": 37.005943298339844, + "step": 1780 + }, + { + "epoch": 0.35132482826300293, + "grad_norm": 4522.624284940719, + "learning_rate": 4.0985700671766834e-07, + "logits/chosen": -2.5886049270629883, + "logits/rejected": -2.4019060134887695, + "logps/chosen": -305.7603454589844, + "logps/rejected": -250.5217742919922, + "loss": 504.2884, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 49.30162811279297, + "rewards/margins": 18.17063331604004, + "rewards/rejected": 31.130992889404297, + "step": 1790 + }, + { + "epoch": 0.35328753680078506, + "grad_norm": 4148.605801079037, + "learning_rate": 4.085362435128262e-07, + "logits/chosen": -2.5969197750091553, + "logits/rejected": -2.430412530899048, + "logps/chosen": -253.0435028076172, + "logps/rejected": -213.9705047607422, + "loss": 450.6715, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 34.25432586669922, + "rewards/margins": -2.4074110984802246, + "rewards/rejected": 36.66173553466797, + "step": 1800 + }, + { + "epoch": 0.35525024533856725, + "grad_norm": 4383.445324888864, + "learning_rate": 4.0720803730951423e-07, + "logits/chosen": -2.7193925380706787, + "logits/rejected": -2.553121566772461, + "logps/chosen": -273.88330078125, + "logps/rejected": -160.2400665283203, + "loss": 465.7694, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 36.27952194213867, + "rewards/margins": 3.4309074878692627, + "rewards/rejected": 32.84861373901367, + "step": 1810 + }, + { + "epoch": 0.3572129538763494, + "grad_norm": 4650.197621021058, + "learning_rate": 4.058724504646834e-07, + "logits/chosen": -2.537329912185669, + "logits/rejected": -2.4770290851593018, + "logps/chosen": -188.79598999023438, + "logps/rejected": -176.62588500976562, + "loss": 420.7817, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 39.98280715942383, + "rewards/margins": -10.657835006713867, + "rewards/rejected": 50.64064407348633, + "step": 1820 + }, + { + "epoch": 0.3591756624141315, + "grad_norm": 5154.40174453461, + "learning_rate": 4.045295456817924e-07, + "logits/chosen": -2.560668706893921, + "logits/rejected": -2.5223541259765625, + "logps/chosen": -246.7219696044922, + "logps/rejected": -244.5557098388672, + "loss": 492.5268, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 30.156757354736328, + "rewards/margins": -3.125023126602173, + "rewards/rejected": 33.281776428222656, + "step": 1830 + }, + { + "epoch": 0.36113837095191365, + "grad_norm": 4277.633576382915, + "learning_rate": 4.0317938600786484e-07, + "logits/chosen": -2.63722562789917, + "logits/rejected": -2.671647071838379, + "logps/chosen": -261.1764831542969, + "logps/rejected": -239.5066680908203, + "loss": 515.1007, + "rewards/accuracies": 0.43333330750465393, + "rewards/chosen": 33.79315948486328, + "rewards/margins": -2.2674169540405273, + "rewards/rejected": 36.06057357788086, + "step": 1840 + }, + { + "epoch": 0.3631010794896958, + "grad_norm": 4849.989172933969, + "learning_rate": 4.0182203483052825e-07, + "logits/chosen": -2.4976391792297363, + "logits/rejected": -2.4753127098083496, + "logps/chosen": -247.03573608398438, + "logps/rejected": -167.9617919921875, + "loss": 464.8464, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 33.916893005371094, + "rewards/margins": -3.894451856613159, + "rewards/rejected": 
37.81134796142578, + "step": 1850 + }, + { + "epoch": 0.3650637880274779, + "grad_norm": 5305.396800713257, + "learning_rate": 4.004575558750389e-07, + "logits/chosen": -2.7480387687683105, + "logits/rejected": -2.552293300628662, + "logps/chosen": -307.5226135253906, + "logps/rejected": -270.4818420410156, + "loss": 515.8467, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 43.089561462402344, + "rewards/margins": 12.114897727966309, + "rewards/rejected": 30.97466468811035, + "step": 1860 + }, + { + "epoch": 0.36702649656526004, + "grad_norm": 3766.794463439121, + "learning_rate": 3.9908601320128976e-07, + "logits/chosen": -2.347365140914917, + "logits/rejected": -2.384580135345459, + "logps/chosen": -201.86337280273438, + "logps/rejected": -206.92623901367188, + "loss": 455.4958, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 35.576011657714844, + "rewards/margins": 0.6420882940292358, + "rewards/rejected": 34.93392562866211, + "step": 1870 + }, + { + "epoch": 0.3689892051030422, + "grad_norm": 5137.166654216243, + "learning_rate": 3.9770747120080284e-07, + "logits/chosen": -2.542323112487793, + "logits/rejected": -2.541592836380005, + "logps/chosen": -189.4404754638672, + "logps/rejected": -167.32725524902344, + "loss": 418.2102, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 36.097442626953125, + "rewards/margins": -5.9926018714904785, + "rewards/rejected": 42.09004592895508, + "step": 1880 + }, + { + "epoch": 0.37095191364082436, + "grad_norm": 4046.4653595977607, + "learning_rate": 3.963219945937063e-07, + "logits/chosen": -2.5712907314300537, + "logits/rejected": -2.4558281898498535, + "logps/chosen": -199.93008422851562, + "logps/rejected": -189.0198974609375, + "loss": 478.3735, + "rewards/accuracies": 0.5, + "rewards/chosen": 33.24176788330078, + "rewards/margins": -3.3829407691955566, + "rewards/rejected": 36.62471389770508, + "step": 1890 + }, + { + "epoch": 0.3729146221786065, + "grad_norm": 5011.92916099243, + "learning_rate": 3.949296484256959e-07, + "logits/chosen": -2.6864089965820312, + "logits/rejected": -2.579397201538086, + "logps/chosen": -211.36483764648438, + "logps/rejected": -207.35995483398438, + "loss": 417.3505, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 43.83955383300781, + "rewards/margins": 7.592188358306885, + "rewards/rejected": 36.24736785888672, + "step": 1900 + }, + { + "epoch": 0.3748773307163886, + "grad_norm": 4019.496297350411, + "learning_rate": 3.935304980649813e-07, + "logits/chosen": -2.607959032058716, + "logits/rejected": -2.6045727729797363, + "logps/chosen": -265.7958068847656, + "logps/rejected": -236.3028564453125, + "loss": 426.1907, + "rewards/accuracies": 0.36666664481163025, + "rewards/chosen": 34.28295135498047, + "rewards/margins": -8.695396423339844, + "rewards/rejected": 42.97834396362305, + "step": 1910 + }, + { + "epoch": 0.37684003925417076, + "grad_norm": 4356.423244834971, + "learning_rate": 3.92124609199217e-07, + "logits/chosen": -2.481682538986206, + "logits/rejected": -2.5561962127685547, + "logps/chosen": -168.49618530273438, + "logps/rejected": -182.44041442871094, + "loss": 407.5496, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 35.77836990356445, + "rewards/margins": 0.9616115689277649, + "rewards/rejected": 34.81675720214844, + "step": 1920 + }, + { + "epoch": 0.3788027477919529, + "grad_norm": 4486.626433944729, + "learning_rate": 3.907120478324185e-07, + "logits/chosen": -2.5384328365325928, + "logits/rejected": 
-2.544644832611084, + "logps/chosen": -250.89175415039062, + "logps/rejected": -236.45236206054688, + "loss": 485.9587, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 38.95748519897461, + "rewards/margins": 4.464973449707031, + "rewards/rejected": 34.492515563964844, + "step": 1930 + }, + { + "epoch": 0.380765456329735, + "grad_norm": 5957.488621544725, + "learning_rate": 3.8929288028186364e-07, + "logits/chosen": -2.544926166534424, + "logits/rejected": -2.453709363937378, + "logps/chosen": -185.8582763671875, + "logps/rejected": -159.22975158691406, + "loss": 432.8231, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 33.13045120239258, + "rewards/margins": -4.596245765686035, + "rewards/rejected": 37.7266960144043, + "step": 1940 + }, + { + "epoch": 0.38272816486751715, + "grad_norm": 4577.262664079208, + "learning_rate": 3.8786717317497875e-07, + "logits/chosen": -2.484395980834961, + "logits/rejected": -2.5102570056915283, + "logps/chosen": -275.0531311035156, + "logps/rejected": -228.39883422851562, + "loss": 430.9164, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 38.611968994140625, + "rewards/margins": -3.3621764183044434, + "rewards/rejected": 41.974151611328125, + "step": 1950 + }, + { + "epoch": 0.38469087340529934, + "grad_norm": 4253.196394738559, + "learning_rate": 3.864349934462111e-07, + "logits/chosen": -2.5822837352752686, + "logits/rejected": -2.562549114227295, + "logps/chosen": -227.0452423095703, + "logps/rejected": -201.46873474121094, + "loss": 461.1355, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 35.39836120605469, + "rewards/margins": -8.067391395568848, + "rewards/rejected": 43.46575164794922, + "step": 1960 + }, + { + "epoch": 0.38665358194308147, + "grad_norm": 4509.157666462303, + "learning_rate": 3.84996408333886e-07, + "logits/chosen": -2.7467126846313477, + "logits/rejected": -2.5748817920684814, + "logps/chosen": -269.403076171875, + "logps/rejected": -185.69235229492188, + "loss": 484.4049, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 36.83091735839844, + "rewards/margins": 1.1465529203414917, + "rewards/rejected": 35.684364318847656, + "step": 1970 + }, + { + "epoch": 0.3886162904808636, + "grad_norm": 4336.3496772389035, + "learning_rate": 3.8355148537705047e-07, + "logits/chosen": -2.700141191482544, + "logits/rejected": -2.7247486114501953, + "logps/chosen": -200.31036376953125, + "logps/rejected": -166.716064453125, + "loss": 438.6323, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 30.3387393951416, + "rewards/margins": -7.581428527832031, + "rewards/rejected": 37.920169830322266, + "step": 1980 + }, + { + "epoch": 0.39057899901864573, + "grad_norm": 5348.9980937018945, + "learning_rate": 3.8210029241230204e-07, + "logits/chosen": -2.7195792198181152, + "logits/rejected": -2.5863280296325684, + "logps/chosen": -295.79119873046875, + "logps/rejected": -250.6574249267578, + "loss": 559.4731, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": 52.1451416015625, + "rewards/margins": 19.041133880615234, + "rewards/rejected": 33.10401153564453, + "step": 1990 + }, + { + "epoch": 0.39254170755642787, + "grad_norm": 4816.794446517075, + "learning_rate": 3.806428975706042e-07, + "logits/chosen": -2.3942437171936035, + "logits/rejected": -2.3860244750976562, + "logps/chosen": -181.06466674804688, + "logps/rejected": -195.17808532714844, + "loss": 387.9617, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 53.385047912597656, + 
"rewards/margins": 18.948444366455078, + "rewards/rejected": 34.43659973144531, + "step": 2000 + }, + { + "epoch": 0.39450441609421, + "grad_norm": 6070.466031055457, + "learning_rate": 3.791793692740876e-07, + "logits/chosen": -2.4864609241485596, + "logits/rejected": -2.498897075653076, + "logps/chosen": -194.35671997070312, + "logps/rejected": -140.57203674316406, + "loss": 423.7441, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 38.10416793823242, + "rewards/margins": 10.152827262878418, + "rewards/rejected": 27.951335906982422, + "step": 2010 + }, + { + "epoch": 0.39646712463199213, + "grad_norm": 4731.301435131496, + "learning_rate": 3.777097762328381e-07, + "logits/chosen": -2.6302943229675293, + "logits/rejected": -2.582368850708008, + "logps/chosen": -262.92559814453125, + "logps/rejected": -223.01577758789062, + "loss": 458.122, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": 33.463165283203125, + "rewards/margins": -4.165156364440918, + "rewards/rejected": 37.62831497192383, + "step": 2020 + }, + { + "epoch": 0.39842983316977426, + "grad_norm": 3182.878603855419, + "learning_rate": 3.762341874416702e-07, + "logits/chosen": -2.4996063709259033, + "logits/rejected": -2.3436052799224854, + "logps/chosen": -191.55349731445312, + "logps/rejected": -135.09239196777344, + "loss": 406.8957, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 35.906471252441406, + "rewards/margins": 7.290860652923584, + "rewards/rejected": 28.615610122680664, + "step": 2030 + }, + { + "epoch": 0.40039254170755645, + "grad_norm": 4112.393036976447, + "learning_rate": 3.7475267217688896e-07, + "logits/chosen": -2.5368053913116455, + "logits/rejected": -2.6471173763275146, + "logps/chosen": -165.9794158935547, + "logps/rejected": -209.2161407470703, + "loss": 420.6569, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 34.43918991088867, + "rewards/margins": -9.173788070678711, + "rewards/rejected": 43.61297607421875, + "step": 2040 + }, + { + "epoch": 0.4023552502453386, + "grad_norm": 4158.868734597658, + "learning_rate": 3.7326529999303633e-07, + "logits/chosen": -2.4712069034576416, + "logits/rejected": -2.466320753097534, + "logps/chosen": -183.16184997558594, + "logps/rejected": -216.8333740234375, + "loss": 419.9413, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 32.53175354003906, + "rewards/margins": -8.037252426147461, + "rewards/rejected": 40.569000244140625, + "step": 2050 + }, + { + "epoch": 0.4043179587831207, + "grad_norm": 4584.7452639371095, + "learning_rate": 3.7177214071962684e-07, + "logits/chosen": -2.5812697410583496, + "logits/rejected": -2.6067354679107666, + "logps/chosen": -212.4852752685547, + "logps/rejected": -277.4024353027344, + "loss": 485.5656, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 39.76322555541992, + "rewards/margins": 2.1475565433502197, + "rewards/rejected": 37.615665435791016, + "step": 2060 + }, + { + "epoch": 0.40628066732090284, + "grad_norm": 4168.50229175131, + "learning_rate": 3.7027326445786835e-07, + "logits/chosen": -2.6407763957977295, + "logits/rejected": -2.5833470821380615, + "logps/chosen": -214.7595977783203, + "logps/rejected": -206.1675262451172, + "loss": 450.9582, + "rewards/accuracies": 0.5, + "rewards/chosen": 41.8992805480957, + "rewards/margins": 10.203690528869629, + "rewards/rejected": 31.69559097290039, + "step": 2070 + }, + { + "epoch": 0.408243375858685, + "grad_norm": 4744.500799794224, + "learning_rate": 3.6876874157737167e-07, + 
"logits/chosen": -2.6174182891845703, + "logits/rejected": -2.624634265899658, + "logps/chosen": -227.5395050048828, + "logps/rejected": -241.0737762451172, + "loss": 471.8342, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 33.56273651123047, + "rewards/margins": -9.664677619934082, + "rewards/rejected": 43.227413177490234, + "step": 2080 + }, + { + "epoch": 0.4102060843964671, + "grad_norm": 4440.846443142311, + "learning_rate": 3.67258642712846e-07, + "logits/chosen": -2.6877191066741943, + "logits/rejected": -2.5582659244537354, + "logps/chosen": -216.27066040039062, + "logps/rejected": -167.98318481445312, + "loss": 436.4445, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 37.20050048828125, + "rewards/margins": 7.238005638122559, + "rewards/rejected": 29.96249771118164, + "step": 2090 + }, + { + "epoch": 0.41216879293424924, + "grad_norm": 4600.09562625597, + "learning_rate": 3.6574303876078366e-07, + "logits/chosen": -2.7248592376708984, + "logits/rejected": -2.645047664642334, + "logps/chosen": -243.5182342529297, + "logps/rejected": -232.8569793701172, + "loss": 449.7953, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 39.2998046875, + "rewards/margins": -1.4524590969085693, + "rewards/rejected": 40.75226593017578, + "step": 2100 + }, + { + "epoch": 0.4141315014720314, + "grad_norm": 5993.644361974152, + "learning_rate": 3.642220008761309e-07, + "logits/chosen": -2.7186810970306396, + "logits/rejected": -2.6466875076293945, + "logps/chosen": -287.5413513183594, + "logps/rejected": -254.6502227783203, + "loss": 533.4502, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 40.02311706542969, + "rewards/margins": 7.727900505065918, + "rewards/rejected": 32.29521942138672, + "step": 2110 + }, + { + "epoch": 0.41609421000981356, + "grad_norm": 3941.922949118662, + "learning_rate": 3.626956004689476e-07, + "logits/chosen": -2.73811411857605, + "logits/rejected": -2.5903234481811523, + "logps/chosen": -314.7489929199219, + "logps/rejected": -184.81614685058594, + "loss": 418.1722, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 46.809268951416016, + "rewards/margins": 16.42887306213379, + "rewards/rejected": 30.380395889282227, + "step": 2120 + }, + { + "epoch": 0.4180569185475957, + "grad_norm": 4994.62562305448, + "learning_rate": 3.6116390920105474e-07, + "logits/chosen": -2.7437849044799805, + "logits/rejected": -2.695708990097046, + "logps/chosen": -231.01681518554688, + "logps/rejected": -200.68460083007812, + "loss": 488.287, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 36.310768127441406, + "rewards/margins": 2.005819320678711, + "rewards/rejected": 34.30495071411133, + "step": 2130 + }, + { + "epoch": 0.4200196270853778, + "grad_norm": 4375.304026236624, + "learning_rate": 3.5962699898266983e-07, + "logits/chosen": -2.6645920276641846, + "logits/rejected": -2.5936334133148193, + "logps/chosen": -211.1452178955078, + "logps/rejected": -186.7575225830078, + "loss": 419.7613, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 32.87139129638672, + "rewards/margins": 3.3978514671325684, + "rewards/rejected": 29.473541259765625, + "step": 2140 + }, + { + "epoch": 0.42198233562315995, + "grad_norm": 3665.1490476903364, + "learning_rate": 3.5808494196903117e-07, + "logits/chosen": -2.5833377838134766, + "logits/rejected": -2.525230884552002, + "logps/chosen": -293.2630615234375, + "logps/rejected": -173.25833129882812, + "loss": 479.6071, + "rewards/accuracies": 
0.4333333373069763, + "rewards/chosen": 32.98417282104492, + "rewards/margins": -8.318937301635742, + "rewards/rejected": 41.3031120300293, + "step": 2150 + }, + { + "epoch": 0.4239450441609421, + "grad_norm": 4315.555600452838, + "learning_rate": 3.565378105570097e-07, + "logits/chosen": -2.62446665763855, + "logits/rejected": -2.5740838050842285, + "logps/chosen": -244.9754638671875, + "logps/rejected": -180.99819946289062, + "loss": 438.9187, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": 39.099754333496094, + "rewards/margins": 12.707671165466309, + "rewards/rejected": 26.392078399658203, + "step": 2160 + }, + { + "epoch": 0.4259077526987242, + "grad_norm": 5510.7039652280155, + "learning_rate": 3.549856773817107e-07, + "logits/chosen": -2.575657367706299, + "logits/rejected": -2.4953224658966064, + "logps/chosen": -205.1666717529297, + "logps/rejected": -187.93984985351562, + "loss": 485.9353, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 33.390872955322266, + "rewards/margins": 5.281000137329102, + "rewards/rejected": 28.109872817993164, + "step": 2170 + }, + { + "epoch": 0.4278704612365064, + "grad_norm": 5120.327309285725, + "learning_rate": 3.5342861531306344e-07, + "logits/chosen": -2.593902111053467, + "logits/rejected": -2.543280601501465, + "logps/chosen": -217.9532928466797, + "logps/rejected": -180.51889038085938, + "loss": 467.9969, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 36.239341735839844, + "rewards/margins": 0.7464396357536316, + "rewards/rejected": 35.492897033691406, + "step": 2180 + }, + { + "epoch": 0.42983316977428854, + "grad_norm": 4710.801534208424, + "learning_rate": 3.518666974524002e-07, + "logits/chosen": -2.6590843200683594, + "logits/rejected": -2.586304187774658, + "logps/chosen": -280.81085205078125, + "logps/rejected": -236.250732421875, + "loss": 466.6427, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 35.222198486328125, + "rewards/margins": -1.8785030841827393, + "rewards/rejected": 37.10070037841797, + "step": 2190 + }, + { + "epoch": 0.43179587831207067, + "grad_norm": 4639.368213133138, + "learning_rate": 3.5029999712902387e-07, + "logits/chosen": -2.7458598613739014, + "logits/rejected": -2.742999792098999, + "logps/chosen": -292.08331298828125, + "logps/rejected": -315.49456787109375, + "loss": 494.2247, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 37.22055435180664, + "rewards/margins": -1.9078445434570312, + "rewards/rejected": 39.12839889526367, + "step": 2200 + }, + { + "epoch": 0.4337585868498528, + "grad_norm": 4075.814076061963, + "learning_rate": 3.4872858789676583e-07, + "logits/chosen": -2.4446651935577393, + "logits/rejected": -2.519179582595825, + "logps/chosen": -198.51075744628906, + "logps/rejected": -195.6839599609375, + "loss": 423.1005, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 34.416419982910156, + "rewards/margins": -4.6278910636901855, + "rewards/rejected": 39.044315338134766, + "step": 2210 + }, + { + "epoch": 0.43572129538763493, + "grad_norm": 5623.558616130809, + "learning_rate": 3.4715254353053236e-07, + "logits/chosen": -2.5616421699523926, + "logits/rejected": -2.620027780532837, + "logps/chosen": -235.35067749023438, + "logps/rejected": -243.73171997070312, + "loss": 478.7932, + "rewards/accuracies": 0.3333333134651184, + "rewards/chosen": 36.581520080566406, + "rewards/margins": -0.6448574066162109, + "rewards/rejected": 37.226375579833984, + "step": 2220 + }, + { + "epoch": 0.43768400392541706, + 
"grad_norm": 4862.19129057131, + "learning_rate": 3.4557193802284123e-07, + "logits/chosen": -2.5887420177459717, + "logits/rejected": -2.5235865116119385, + "logps/chosen": -227.75057983398438, + "logps/rejected": -214.9437255859375, + "loss": 468.0142, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 36.97982406616211, + "rewards/margins": -6.219507694244385, + "rewards/rejected": 43.19933319091797, + "step": 2230 + }, + { + "epoch": 0.4396467124631992, + "grad_norm": 4734.63707338868, + "learning_rate": 3.4398684558034763e-07, + "logits/chosen": -2.451140880584717, + "logits/rejected": -2.4948220252990723, + "logps/chosen": -226.34683227539062, + "logps/rejected": -202.0099639892578, + "loss": 465.8895, + "rewards/accuracies": 0.3999999761581421, + "rewards/chosen": 42.129207611083984, + "rewards/margins": -5.78919792175293, + "rewards/rejected": 47.91840362548828, + "step": 2240 + }, + { + "epoch": 0.44160942100098133, + "grad_norm": 6076.960687119696, + "learning_rate": 3.4239734062036067e-07, + "logits/chosen": -2.6000795364379883, + "logits/rejected": -2.6038219928741455, + "logps/chosen": -245.7303466796875, + "logps/rejected": -232.3347930908203, + "loss": 441.4258, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": 38.31798553466797, + "rewards/margins": -9.228495597839355, + "rewards/rejected": 47.546485900878906, + "step": 2250 + }, + { + "epoch": 0.4435721295387635, + "grad_norm": 4559.100462974265, + "learning_rate": 3.4080349776734924e-07, + "logits/chosen": -2.5710062980651855, + "logits/rejected": -2.5078659057617188, + "logps/chosen": -257.9673767089844, + "logps/rejected": -238.927734375, + "loss": 445.9494, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 40.69451141357422, + "rewards/margins": 1.9841305017471313, + "rewards/rejected": 38.71038818359375, + "step": 2260 + }, + { + "epoch": 0.44553483807654565, + "grad_norm": 4245.220565699372, + "learning_rate": 3.392053918494389e-07, + "logits/chosen": -2.533465623855591, + "logits/rejected": -2.56412672996521, + "logps/chosen": -275.1715087890625, + "logps/rejected": -240.7425994873047, + "loss": 470.4912, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 31.101980209350586, + "rewards/margins": 1.7008116245269775, + "rewards/rejected": 29.401172637939453, + "step": 2270 + }, + { + "epoch": 0.4474975466143278, + "grad_norm": 4162.537615965158, + "learning_rate": 3.376030978948983e-07, + "logits/chosen": -2.4892563819885254, + "logits/rejected": -2.392144203186035, + "logps/chosen": -279.1177062988281, + "logps/rejected": -255.7156524658203, + "loss": 457.0973, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 43.22492218017578, + "rewards/margins": 11.627494812011719, + "rewards/rejected": 31.597427368164062, + "step": 2280 + }, + { + "epoch": 0.4494602551521099, + "grad_norm": 5230.945935986782, + "learning_rate": 3.3599669112861756e-07, + "logits/chosen": -2.6333365440368652, + "logits/rejected": -2.6040003299713135, + "logps/chosen": -226.58486938476562, + "logps/rejected": -245.96005249023438, + "loss": 477.1376, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 42.69200897216797, + "rewards/margins": -3.276989698410034, + "rewards/rejected": 45.968997955322266, + "step": 2290 + }, + { + "epoch": 0.45142296368989204, + "grad_norm": 3929.202252727979, + "learning_rate": 3.343862469685755e-07, + "logits/chosen": -2.5004894733428955, + "logits/rejected": -2.4886584281921387, + "logps/chosen": -208.4034881591797, + "logps/rejected": 
-188.3735809326172, + "loss": 456.2935, + "rewards/accuracies": 0.5, + "rewards/chosen": 31.812917709350586, + "rewards/margins": -6.975862979888916, + "rewards/rejected": 38.788780212402344, + "step": 2300 + }, + { + "epoch": 0.4533856722276742, + "grad_norm": 5214.156588475492, + "learning_rate": 3.3277184102230004e-07, + "logits/chosen": -2.7178711891174316, + "logits/rejected": -2.724022388458252, + "logps/chosen": -234.6586456298828, + "logps/rejected": -246.9966583251953, + "loss": 499.9511, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 34.978572845458984, + "rewards/margins": 7.584236145019531, + "rewards/rejected": 27.394336700439453, + "step": 2310 + }, + { + "epoch": 0.4553483807654563, + "grad_norm": 4575.858557653078, + "learning_rate": 3.311535490833176e-07, + "logits/chosen": -2.4982151985168457, + "logits/rejected": -2.4811289310455322, + "logps/chosen": -194.088134765625, + "logps/rejected": -242.5849151611328, + "loss": 499.5238, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 54.86347198486328, + "rewards/margins": 10.496244430541992, + "rewards/rejected": 44.36722946166992, + "step": 2320 + }, + { + "epoch": 0.4573110893032385, + "grad_norm": 4641.516621248123, + "learning_rate": 3.2953144712759537e-07, + "logits/chosen": -2.662986993789673, + "logits/rejected": -2.5589983463287354, + "logps/chosen": -304.39935302734375, + "logps/rejected": -216.46932983398438, + "loss": 479.5048, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 38.267696380615234, + "rewards/margins": 5.254390716552734, + "rewards/rejected": 33.013301849365234, + "step": 2330 + }, + { + "epoch": 0.4592737978410206, + "grad_norm": 4408.044494580562, + "learning_rate": 3.279056113099742e-07, + "logits/chosen": -2.619048833847046, + "logits/rejected": -2.541274309158325, + "logps/chosen": -247.12771606445312, + "logps/rejected": -290.5010681152344, + "loss": 460.61, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 40.39784240722656, + "rewards/margins": 1.5373001098632812, + "rewards/rejected": 38.86054229736328, + "step": 2340 + }, + { + "epoch": 0.46123650637880276, + "grad_norm": 4233.056941895596, + "learning_rate": 3.2627611796059283e-07, + "logits/chosen": -2.5531439781188965, + "logits/rejected": -2.4957103729248047, + "logps/chosen": -233.532958984375, + "logps/rejected": -212.55191040039062, + "loss": 494.6469, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 39.436241149902344, + "rewards/margins": 13.565861701965332, + "rewards/rejected": 25.870376586914062, + "step": 2350 + }, + { + "epoch": 0.4631992149165849, + "grad_norm": 5123.462575919028, + "learning_rate": 3.246430435813051e-07, + "logits/chosen": -2.6400294303894043, + "logits/rejected": -2.5540122985839844, + "logps/chosen": -237.27371215820312, + "logps/rejected": -184.3572235107422, + "loss": 444.0543, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 36.393699645996094, + "rewards/margins": 2.897949695587158, + "rewards/rejected": 33.495750427246094, + "step": 2360 + }, + { + "epoch": 0.465161923454367, + "grad_norm": 3504.0236788538973, + "learning_rate": 3.230064648420878e-07, + "logits/chosen": -2.6284031867980957, + "logits/rejected": -2.446946382522583, + "logps/chosen": -233.88278198242188, + "logps/rejected": -162.52059936523438, + "loss": 412.3497, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 42.815452575683594, + "rewards/margins": 12.520002365112305, + "rewards/rejected": 30.29545021057129, + "step": 2370 + }, + 
{ + "epoch": 0.46712463199214915, + "grad_norm": 4766.94598434386, + "learning_rate": 3.2136645857744114e-07, + "logits/chosen": -2.397789239883423, + "logits/rejected": -2.3977391719818115, + "logps/chosen": -189.5882568359375, + "logps/rejected": -226.1632843017578, + "loss": 447.2617, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 42.929969787597656, + "rewards/margins": -7.894274711608887, + "rewards/rejected": 50.824241638183594, + "step": 2380 + }, + { + "epoch": 0.4690873405299313, + "grad_norm": 4702.600347462076, + "learning_rate": 3.197231017827818e-07, + "logits/chosen": -2.5886404514312744, + "logits/rejected": -2.542672872543335, + "logps/chosen": -241.9503173828125, + "logps/rejected": -225.9481658935547, + "loss": 400.5882, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 39.55445098876953, + "rewards/margins": 4.344472408294678, + "rewards/rejected": 35.20997619628906, + "step": 2390 + }, + { + "epoch": 0.47105004906771347, + "grad_norm": 5874.7133346146, + "learning_rate": 3.1807647161082797e-07, + "logits/chosen": -2.6827895641326904, + "logits/rejected": -2.5796356201171875, + "logps/chosen": -223.42575073242188, + "logps/rejected": -222.6304168701172, + "loss": 508.7384, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 51.37737274169922, + "rewards/margins": 16.392112731933594, + "rewards/rejected": 34.98526382446289, + "step": 2400 + }, + { + "epoch": 0.4730127576054956, + "grad_norm": 4080.6756176221616, + "learning_rate": 3.1642664536797693e-07, + "logits/chosen": -2.5640296936035156, + "logits/rejected": -2.45853328704834, + "logps/chosen": -232.3906707763672, + "logps/rejected": -225.85330200195312, + "loss": 418.0434, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 40.605072021484375, + "rewards/margins": -7.888661861419678, + "rewards/rejected": 48.49373245239258, + "step": 2410 + }, + { + "epoch": 0.47497546614327774, + "grad_norm": 4602.357148985481, + "learning_rate": 3.147737005106762e-07, + "logits/chosen": -2.554788112640381, + "logits/rejected": -2.701284646987915, + "logps/chosen": -270.83477783203125, + "logps/rejected": -232.8144073486328, + "loss": 454.253, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 28.20511817932129, + "rewards/margins": -13.63386058807373, + "rewards/rejected": 41.8389778137207, + "step": 2420 + }, + { + "epoch": 0.47693817468105987, + "grad_norm": 5186.892444597569, + "learning_rate": 3.1311771464178655e-07, + "logits/chosen": -2.549485683441162, + "logits/rejected": -2.512749195098877, + "logps/chosen": -263.4794616699219, + "logps/rejected": -182.04495239257812, + "loss": 462.0962, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": 31.768932342529297, + "rewards/margins": -2.404651165008545, + "rewards/rejected": 34.173583984375, + "step": 2430 + }, + { + "epoch": 0.478900883218842, + "grad_norm": 5048.1658714524065, + "learning_rate": 3.1145876550693893e-07, + "logits/chosen": -2.690701961517334, + "logits/rejected": -2.6157898902893066, + "logps/chosen": -246.66543579101562, + "logps/rejected": -191.4427032470703, + "loss": 429.1527, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 39.08533477783203, + "rewards/margins": -3.444082260131836, + "rewards/rejected": 42.5294189453125, + "step": 2440 + }, + { + "epoch": 0.48086359175662413, + "grad_norm": 4525.940898564814, + "learning_rate": 3.097969309908847e-07, + "logits/chosen": -2.383237838745117, + "logits/rejected": -2.433600902557373, + "logps/chosen": 
-188.96987915039062, + "logps/rejected": -172.08071899414062, + "loss": 408.2894, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": 50.24271774291992, + "rewards/margins": 15.582735061645508, + "rewards/rejected": 34.65998077392578, + "step": 2450 + }, + { + "epoch": 0.48282630029440626, + "grad_norm": 5357.565333054472, + "learning_rate": 3.081322891138382e-07, + "logits/chosen": -2.651235818862915, + "logits/rejected": -2.6922688484191895, + "logps/chosen": -265.7408142089844, + "logps/rejected": -224.93313598632812, + "loss": 480.8474, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 39.43803787231445, + "rewards/margins": 2.3424158096313477, + "rewards/rejected": 37.095619201660156, + "step": 2460 + }, + { + "epoch": 0.4847890088321884, + "grad_norm": 4624.913071085645, + "learning_rate": 3.0646491802781514e-07, + "logits/chosen": -2.5180201530456543, + "logits/rejected": -2.3734679222106934, + "logps/chosen": -233.11752319335938, + "logps/rejected": -151.3263702392578, + "loss": 458.9539, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 33.529136657714844, + "rewards/margins": 4.377078056335449, + "rewards/rejected": 29.152053833007812, + "step": 2470 + }, + { + "epoch": 0.4867517173699706, + "grad_norm": 3849.641695370538, + "learning_rate": 3.047948960129624e-07, + "logits/chosen": -2.4560704231262207, + "logits/rejected": -2.467038631439209, + "logps/chosen": -174.33287048339844, + "logps/rejected": -179.1482696533203, + "loss": 425.778, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 35.16417694091797, + "rewards/margins": -3.221940517425537, + "rewards/rejected": 38.38611602783203, + "step": 2480 + }, + { + "epoch": 0.4887144259077527, + "grad_norm": 4474.0585982753055, + "learning_rate": 3.0312230147388334e-07, + "logits/chosen": -2.7361364364624023, + "logits/rejected": -2.6660802364349365, + "logps/chosen": -265.2712097167969, + "logps/rejected": -250.2879638671875, + "loss": 499.5539, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 43.949073791503906, + "rewards/margins": -13.000185012817383, + "rewards/rejected": 56.94926071166992, + "step": 2490 + }, + { + "epoch": 0.49067713444553485, + "grad_norm": 4876.035584744987, + "learning_rate": 3.01447212935957e-07, + "logits/chosen": -2.598324775695801, + "logits/rejected": -2.6883630752563477, + "logps/chosen": -192.37045288085938, + "logps/rejected": -199.43600463867188, + "loss": 446.1502, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 34.07781219482422, + "rewards/margins": -17.534502029418945, + "rewards/rejected": 51.61231231689453, + "step": 2500 + }, + { + "epoch": 0.492639842983317, + "grad_norm": 4593.399795935327, + "learning_rate": 2.9976970904165104e-07, + "logits/chosen": -2.8025119304656982, + "logits/rejected": -2.6266629695892334, + "logps/chosen": -341.9221496582031, + "logps/rejected": -253.7430419921875, + "loss": 466.9557, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 43.02267837524414, + "rewards/margins": 2.0816879272460938, + "rewards/rejected": 40.94099426269531, + "step": 2510 + }, + { + "epoch": 0.4946025515210991, + "grad_norm": 5402.563955263948, + "learning_rate": 2.980898685468301e-07, + "logits/chosen": -2.6228396892547607, + "logits/rejected": -2.5475149154663086, + "logps/chosen": -258.35101318359375, + "logps/rejected": -198.5154266357422, + "loss": 441.9007, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 41.29050064086914, + "rewards/margins": 7.081448554992676, + 
"rewards/rejected": 34.20905303955078, + "step": 2520 + }, + { + "epoch": 0.49656526005888124, + "grad_norm": 4280.269687459939, + "learning_rate": 2.96407770317058e-07, + "logits/chosen": -2.4809489250183105, + "logits/rejected": -2.4302022457122803, + "logps/chosen": -176.412353515625, + "logps/rejected": -169.72401428222656, + "loss": 423.0802, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 31.984704971313477, + "rewards/margins": -6.700386047363281, + "rewards/rejected": 38.685096740722656, + "step": 2530 + }, + { + "epoch": 0.4985279685966634, + "grad_norm": 4237.256524133457, + "learning_rate": 2.9472349332389523e-07, + "logits/chosen": -2.536940574645996, + "logits/rejected": -2.4029650688171387, + "logps/chosen": -257.356689453125, + "logps/rejected": -158.67816162109375, + "loss": 460.7807, + "rewards/accuracies": 0.6333332657814026, + "rewards/chosen": 36.0179443359375, + "rewards/margins": -4.86074686050415, + "rewards/rejected": 40.87868881225586, + "step": 2540 + }, + { + "epoch": 0.5004906771344455, + "grad_norm": 4282.243026261946, + "learning_rate": 2.930371166411915e-07, + "logits/chosen": -2.752068042755127, + "logits/rejected": -2.6536898612976074, + "logps/chosen": -276.26544189453125, + "logps/rejected": -274.9800720214844, + "loss": 461.675, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 38.50434494018555, + "rewards/margins": 1.045363426208496, + "rewards/rejected": 37.458984375, + "step": 2550 + }, + { + "epoch": 0.5024533856722276, + "grad_norm": 4716.421144282486, + "learning_rate": 2.913487194413731e-07, + "logits/chosen": -2.5679516792297363, + "logits/rejected": -2.6082217693328857, + "logps/chosen": -239.5904541015625, + "logps/rejected": -245.8770294189453, + "loss": 461.8308, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 43.07164001464844, + "rewards/margins": 9.934675216674805, + "rewards/rejected": 33.136966705322266, + "step": 2560 + }, + { + "epoch": 0.5044160942100098, + "grad_norm": 4660.636941487216, + "learning_rate": 2.896583809917262e-07, + "logits/chosen": -2.5704874992370605, + "logits/rejected": -2.527533769607544, + "logps/chosen": -186.18045043945312, + "logps/rejected": -185.8052215576172, + "loss": 423.5537, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 38.3198356628418, + "rewards/margins": -0.8927882313728333, + "rewards/rejected": 39.212623596191406, + "step": 2570 + }, + { + "epoch": 0.5063788027477919, + "grad_norm": 5162.821735056709, + "learning_rate": 2.879661806506751e-07, + "logits/chosen": -2.5468087196350098, + "logits/rejected": -2.4348654747009277, + "logps/chosen": -238.07852172851562, + "logps/rejected": -274.9657287597656, + "loss": 453.0871, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 34.86083221435547, + "rewards/margins": 0.16193465888500214, + "rewards/rejected": 34.698890686035156, + "step": 2580 + }, + { + "epoch": 0.5083415112855741, + "grad_norm": 4605.722636207217, + "learning_rate": 2.86272197864057e-07, + "logits/chosen": -2.7966628074645996, + "logits/rejected": -2.7884743213653564, + "logps/chosen": -282.0419921875, + "logps/rejected": -219.54385375976562, + "loss": 516.6672, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 38.49335479736328, + "rewards/margins": 3.876373767852783, + "rewards/rejected": 34.61698532104492, + "step": 2590 + }, + { + "epoch": 0.5103042198233563, + "grad_norm": 4389.535360502405, + "learning_rate": 2.845765121613912e-07, + "logits/chosen": -2.5735490322113037, + 
"logits/rejected": -2.522249937057495, + "logps/chosen": -251.6597900390625, + "logps/rejected": -150.88699340820312, + "loss": 437.7409, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 43.90034484863281, + "rewards/margins": -4.054275989532471, + "rewards/rejected": 47.954627990722656, + "step": 2600 + }, + { + "epoch": 0.5122669283611384, + "grad_norm": 5505.17046958691, + "learning_rate": 2.828792031521464e-07, + "logits/chosen": -2.60178542137146, + "logits/rejected": -2.643982172012329, + "logps/chosen": -269.0943298339844, + "logps/rejected": -254.6349639892578, + "loss": 477.3345, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 45.671730041503906, + "rewards/margins": 5.145081043243408, + "rewards/rejected": 40.52665328979492, + "step": 2610 + }, + { + "epoch": 0.5142296368989205, + "grad_norm": 5794.100559420684, + "learning_rate": 2.811803505220025e-07, + "logits/chosen": -2.621931552886963, + "logits/rejected": -2.460784435272217, + "logps/chosen": -207.79434204101562, + "logps/rejected": -173.03713989257812, + "loss": 446.7151, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": 55.721534729003906, + "rewards/margins": 21.065649032592773, + "rewards/rejected": 34.6558837890625, + "step": 2620 + }, + { + "epoch": 0.5161923454367027, + "grad_norm": 4998.841167117095, + "learning_rate": 2.7948003402910975e-07, + "logits/chosen": -2.6204910278320312, + "logits/rejected": -2.52304744720459, + "logps/chosen": -263.1111755371094, + "logps/rejected": -216.44528198242188, + "loss": 435.7576, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 34.93376922607422, + "rewards/margins": -2.444157123565674, + "rewards/rejected": 37.37792205810547, + "step": 2630 + }, + { + "epoch": 0.5181550539744848, + "grad_norm": 4647.687919322091, + "learning_rate": 2.777783335003442e-07, + "logits/chosen": -2.744004249572754, + "logits/rejected": -2.6306076049804688, + "logps/chosen": -273.3846130371094, + "logps/rejected": -217.7678985595703, + "loss": 451.462, + "rewards/accuracies": 0.36666664481163025, + "rewards/chosen": 38.19150161743164, + "rewards/margins": -4.022319793701172, + "rewards/rejected": 42.21382141113281, + "step": 2640 + }, + { + "epoch": 0.5201177625122669, + "grad_norm": 4640.001302888592, + "learning_rate": 2.760753288275598e-07, + "logits/chosen": -2.6173348426818848, + "logits/rejected": -2.473811149597168, + "logps/chosen": -217.0681610107422, + "logps/rejected": -221.2267303466797, + "loss": 402.598, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": 39.30550003051758, + "rewards/margins": 7.629674434661865, + "rewards/rejected": 31.675817489624023, + "step": 2650 + }, + { + "epoch": 0.5220804710500491, + "grad_norm": 4162.675353293604, + "learning_rate": 2.7437109996383795e-07, + "logits/chosen": -2.538184642791748, + "logits/rejected": -2.4441897869110107, + "logps/chosen": -217.38949584960938, + "logps/rejected": -181.7127227783203, + "loss": 473.9091, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 38.0445556640625, + "rewards/margins": 9.555212020874023, + "rewards/rejected": 28.48933982849121, + "step": 2660 + }, + { + "epoch": 0.5240431795878312, + "grad_norm": 4733.653275267734, + "learning_rate": 2.7266572691973365e-07, + "logits/chosen": -2.7847039699554443, + "logits/rejected": -2.7330451011657715, + "logps/chosen": -292.99713134765625, + "logps/rejected": -241.0466766357422, + "loss": 466.7176, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 40.643089294433594, + 
"rewards/margins": 4.975247859954834, + "rewards/rejected": 35.6678466796875, + "step": 2670 + }, + { + "epoch": 0.5260058881256133, + "grad_norm": 4359.262818451264, + "learning_rate": 2.709592897595191e-07, + "logits/chosen": -2.5485806465148926, + "logits/rejected": -2.4584767818450928, + "logps/chosen": -237.18295288085938, + "logps/rejected": -147.65628051757812, + "loss": 427.6604, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 38.61459732055664, + "rewards/margins": 0.6483393907546997, + "rewards/rejected": 37.96625518798828, + "step": 2680 + }, + { + "epoch": 0.5279685966633955, + "grad_norm": 5233.246097927523, + "learning_rate": 2.6925186859742494e-07, + "logits/chosen": -2.6418886184692383, + "logits/rejected": -2.6851634979248047, + "logps/chosen": -221.16012573242188, + "logps/rejected": -182.71084594726562, + "loss": 468.212, + "rewards/accuracies": 0.36666664481163025, + "rewards/chosen": 36.26055145263672, + "rewards/margins": -8.975361824035645, + "rewards/rejected": 45.23591232299805, + "step": 2690 + }, + { + "epoch": 0.5299313052011776, + "grad_norm": 4190.663619639473, + "learning_rate": 2.675435435938788e-07, + "logits/chosen": -2.690380811691284, + "logits/rejected": -2.6410889625549316, + "logps/chosen": -280.4224548339844, + "logps/rejected": -229.3972625732422, + "loss": 422.0635, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 35.18825149536133, + "rewards/margins": 8.508127212524414, + "rewards/rejected": 26.680124282836914, + "step": 2700 + }, + { + "epoch": 0.5318940137389597, + "grad_norm": 4756.840304078283, + "learning_rate": 2.6583439495174247e-07, + "logits/chosen": -2.7346031665802, + "logits/rejected": -2.6536295413970947, + "logps/chosen": -249.3466339111328, + "logps/rejected": -198.13674926757812, + "loss": 464.5689, + "rewards/accuracies": 0.4999999403953552, + "rewards/chosen": 41.55207443237305, + "rewards/margins": -0.3666302561759949, + "rewards/rejected": 41.918704986572266, + "step": 2710 + }, + { + "epoch": 0.5338567222767419, + "grad_norm": 4667.309132532129, + "learning_rate": 2.6412450291254564e-07, + "logits/chosen": -2.5879178047180176, + "logits/rejected": -2.5420918464660645, + "logps/chosen": -259.8382263183594, + "logps/rejected": -201.00717163085938, + "loss": 459.5845, + "rewards/accuracies": 0.5, + "rewards/chosen": 31.532434463500977, + "rewards/margins": -8.552940368652344, + "rewards/rejected": 40.08536911010742, + "step": 2720 + }, + { + "epoch": 0.535819430814524, + "grad_norm": 4748.499403267251, + "learning_rate": 2.6241394775271954e-07, + "logits/chosen": -2.5561461448669434, + "logits/rejected": -2.4587976932525635, + "logps/chosen": -221.2384490966797, + "logps/rejected": -198.52723693847656, + "loss": 479.5985, + "rewards/accuracies": 0.4999999403953552, + "rewards/chosen": 39.287330627441406, + "rewards/margins": 8.734041213989258, + "rewards/rejected": 30.553293228149414, + "step": 2730 + }, + { + "epoch": 0.5377821393523062, + "grad_norm": 4533.5320344006905, + "learning_rate": 2.607028097798276e-07, + "logits/chosen": -2.515784740447998, + "logits/rejected": -2.4613964557647705, + "logps/chosen": -244.35098266601562, + "logps/rejected": -255.76876831054688, + "loss": 449.5435, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 41.61980438232422, + "rewards/margins": 4.5574846267700195, + "rewards/rejected": 37.06231689453125, + "step": 2740 + }, + { + "epoch": 0.5397448478900884, + "grad_norm": 4306.144339979795, + "learning_rate": 2.5899116932879534e-07, + 
"logits/chosen": -2.5324409008026123, + "logits/rejected": -2.4598164558410645, + "logps/chosen": -163.7893524169922, + "logps/rejected": -178.0715789794922, + "loss": 432.9026, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 36.10814666748047, + "rewards/margins": 5.678139686584473, + "rewards/rejected": 30.430007934570312, + "step": 2750 + }, + { + "epoch": 0.5417075564278705, + "grad_norm": 4113.54374124532, + "learning_rate": 2.5727910675813866e-07, + "logits/chosen": -2.435154438018799, + "logits/rejected": -2.542113780975342, + "logps/chosen": -212.41983032226562, + "logps/rejected": -228.49526977539062, + "loss": 470.3569, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 40.49172592163086, + "rewards/margins": -3.3083176612854004, + "rewards/rejected": 43.80004119873047, + "step": 2760 + }, + { + "epoch": 0.5436702649656526, + "grad_norm": 5349.584082641944, + "learning_rate": 2.555667024461915e-07, + "logits/chosen": -2.6058411598205566, + "logits/rejected": -2.6284537315368652, + "logps/chosen": -194.70742797851562, + "logps/rejected": -217.4590301513672, + "loss": 459.6099, + "rewards/accuracies": 0.5, + "rewards/chosen": 37.93566131591797, + "rewards/margins": -10.541536331176758, + "rewards/rejected": 48.477195739746094, + "step": 2770 + }, + { + "epoch": 0.5456329735034348, + "grad_norm": 4703.226204053684, + "learning_rate": 2.5385403678733157e-07, + "logits/chosen": -2.5422253608703613, + "logits/rejected": -2.514312505722046, + "logps/chosen": -158.90664672851562, + "logps/rejected": -199.33143615722656, + "loss": 419.9885, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 40.289405822753906, + "rewards/margins": 8.071573257446289, + "rewards/rejected": 32.21782684326172, + "step": 2780 + }, + { + "epoch": 0.5475956820412169, + "grad_norm": 3944.7102508190455, + "learning_rate": 2.521411901882067e-07, + "logits/chosen": -2.628328800201416, + "logits/rejected": -2.554342746734619, + "logps/chosen": -226.6717529296875, + "logps/rejected": -170.31246948242188, + "loss": 441.2466, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 36.1329231262207, + "rewards/margins": 0.4193221926689148, + "rewards/rejected": 35.713600158691406, + "step": 2790 + }, + { + "epoch": 0.549558390578999, + "grad_norm": 3885.3308770567787, + "learning_rate": 2.504282430639594e-07, + "logits/chosen": -2.6190319061279297, + "logits/rejected": -2.5678019523620605, + "logps/chosen": -174.28733825683594, + "logps/rejected": -176.26931762695312, + "loss": 442.4453, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 33.87175369262695, + "rewards/margins": 5.558090686798096, + "rewards/rejected": 28.31365966796875, + "step": 2800 + }, + { + "epoch": 0.5515210991167812, + "grad_norm": 5218.016929324244, + "learning_rate": 2.4871527583445163e-07, + "logits/chosen": -2.665811538696289, + "logits/rejected": -2.6195101737976074, + "logps/chosen": -276.04541015625, + "logps/rejected": -239.3740234375, + "loss": 430.998, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 33.38459014892578, + "rewards/margins": -0.7618095278739929, + "rewards/rejected": 34.14639663696289, + "step": 2810 + }, + { + "epoch": 0.5534838076545633, + "grad_norm": 4109.24398892843, + "learning_rate": 2.470023689204893e-07, + "logits/chosen": -2.6391077041625977, + "logits/rejected": -2.6396422386169434, + "logps/chosen": -252.8202362060547, + "logps/rejected": -239.5262451171875, + "loss": 420.4567, + "rewards/accuracies": 0.46666663885116577, + 
"rewards/chosen": 41.7357177734375, + "rewards/margins": 1.3734490871429443, + "rewards/rejected": 40.362266540527344, + "step": 2820 + }, + { + "epoch": 0.5554465161923454, + "grad_norm": 4059.341902081399, + "learning_rate": 2.452896027400465e-07, + "logits/chosen": -2.72487211227417, + "logits/rejected": -2.6438424587249756, + "logps/chosen": -234.49044799804688, + "logps/rejected": -237.9053192138672, + "loss": 470.4413, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 46.0778923034668, + "rewards/margins": 1.6294276714324951, + "rewards/rejected": 44.448463439941406, + "step": 2830 + }, + { + "epoch": 0.5574092247301276, + "grad_norm": 5062.330068814192, + "learning_rate": 2.4357705770449046e-07, + "logits/chosen": -2.505913257598877, + "logits/rejected": -2.5338852405548096, + "logps/chosen": -215.6706085205078, + "logps/rejected": -204.14747619628906, + "loss": 421.4331, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 34.1905632019043, + "rewards/margins": 0.7007572054862976, + "rewards/rejected": 33.48980712890625, + "step": 2840 + }, + { + "epoch": 0.5593719332679097, + "grad_norm": 4383.801364110385, + "learning_rate": 2.418648142148056e-07, + "logits/chosen": -2.4966864585876465, + "logits/rejected": -2.526427984237671, + "logps/chosen": -246.0478515625, + "logps/rejected": -179.22616577148438, + "loss": 456.8376, + "rewards/accuracies": 0.4999999403953552, + "rewards/chosen": 37.7618293762207, + "rewards/margins": -0.15580138564109802, + "rewards/rejected": 37.91763687133789, + "step": 2850 + }, + { + "epoch": 0.5613346418056918, + "grad_norm": 3793.43585256964, + "learning_rate": 2.4015295265781966e-07, + "logits/chosen": -2.387354850769043, + "logits/rejected": -2.3449103832244873, + "logps/chosen": -254.6068878173828, + "logps/rejected": -263.32269287109375, + "loss": 454.3401, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 43.64141845703125, + "rewards/margins": -1.6728923320770264, + "rewards/rejected": 45.31431198120117, + "step": 2860 + }, + { + "epoch": 0.563297350343474, + "grad_norm": 4384.572825733482, + "learning_rate": 2.3844155340242893e-07, + "logits/chosen": -2.5381031036376953, + "logits/rejected": -2.506338596343994, + "logps/chosen": -176.04165649414062, + "logps/rejected": -180.90724182128906, + "loss": 395.7169, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 40.10258102416992, + "rewards/margins": 2.170680522918701, + "rewards/rejected": 37.931907653808594, + "step": 2870 + }, + { + "epoch": 0.5652600588812562, + "grad_norm": 6019.272847984498, + "learning_rate": 2.36730696795826e-07, + "logits/chosen": -2.772923469543457, + "logits/rejected": -2.754718065261841, + "logps/chosen": -205.79281616210938, + "logps/rejected": -298.2911071777344, + "loss": 514.6727, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 34.41475296020508, + "rewards/margins": -3.1085028648376465, + "rewards/rejected": 37.52325439453125, + "step": 2880 + }, + { + "epoch": 0.5672227674190383, + "grad_norm": 4282.95508081811, + "learning_rate": 2.3502046315972655e-07, + "logits/chosen": -2.614781141281128, + "logits/rejected": -2.548280715942383, + "logps/chosen": -272.5713195800781, + "logps/rejected": -253.4302520751953, + "loss": 470.3598, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 46.17670440673828, + "rewards/margins": 5.820972442626953, + "rewards/rejected": 40.355735778808594, + "step": 2890 + }, + { + "epoch": 0.5691854759568205, + "grad_norm": 4430.8877672860535, + 
"learning_rate": 2.3331093278659906e-07, + "logits/chosen": -2.634147882461548, + "logits/rejected": -2.641078472137451, + "logps/chosen": -258.8018798828125, + "logps/rejected": -241.8573760986328, + "loss": 461.3288, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 49.79001235961914, + "rewards/margins": 7.490835666656494, + "rewards/rejected": 42.29917526245117, + "step": 2900 + }, + { + "epoch": 0.5711481844946026, + "grad_norm": 4962.723571464093, + "learning_rate": 2.31602185935895e-07, + "logits/chosen": -2.749091386795044, + "logits/rejected": -2.6512725353240967, + "logps/chosen": -248.509033203125, + "logps/rejected": -194.28793334960938, + "loss": 441.0518, + "rewards/accuracies": 0.5, + "rewards/chosen": 35.29741668701172, + "rewards/margins": 6.335047721862793, + "rewards/rejected": 28.96236801147461, + "step": 2910 + }, + { + "epoch": 0.5731108930323847, + "grad_norm": 4110.409170118379, + "learning_rate": 2.298943028302811e-07, + "logits/chosen": -2.7444474697113037, + "logits/rejected": -2.6776154041290283, + "logps/chosen": -255.79269409179688, + "logps/rejected": -257.0079650878906, + "loss": 463.8671, + "rewards/accuracies": 0.3999999761581421, + "rewards/chosen": 33.684104919433594, + "rewards/margins": -20.564167022705078, + "rewards/rejected": 54.24827194213867, + "step": 2920 + }, + { + "epoch": 0.5750736015701668, + "grad_norm": 3637.573540955409, + "learning_rate": 2.2818736365187242e-07, + "logits/chosen": -2.6612980365753174, + "logits/rejected": -2.625981330871582, + "logps/chosen": -194.0688934326172, + "logps/rejected": -154.44122314453125, + "loss": 375.4742, + "rewards/accuracies": 0.5, + "rewards/chosen": 33.018577575683594, + "rewards/margins": -4.529515266418457, + "rewards/rejected": 37.548099517822266, + "step": 2930 + }, + { + "epoch": 0.577036310107949, + "grad_norm": 4580.753048080811, + "learning_rate": 2.2648144853846847e-07, + "logits/chosen": -2.6012885570526123, + "logits/rejected": -2.56108021736145, + "logps/chosen": -231.09231567382812, + "logps/rejected": -221.6013641357422, + "loss": 468.9104, + "rewards/accuracies": 0.5, + "rewards/chosen": 28.090499877929688, + "rewards/margins": -15.818713188171387, + "rewards/rejected": 43.909210205078125, + "step": 2940 + }, + { + "epoch": 0.5789990186457311, + "grad_norm": 4972.307690153018, + "learning_rate": 2.247766375797906e-07, + "logits/chosen": -2.586660861968994, + "logits/rejected": -2.606600522994995, + "logps/chosen": -156.38302612304688, + "logps/rejected": -165.44100952148438, + "loss": 493.1104, + "rewards/accuracies": 0.3999999761581421, + "rewards/chosen": 35.012245178222656, + "rewards/margins": -4.334465980529785, + "rewards/rejected": 39.34671401977539, + "step": 2950 + }, + { + "epoch": 0.5809617271835132, + "grad_norm": 3767.4283177101634, + "learning_rate": 2.2307301081372222e-07, + "logits/chosen": -2.544673204421997, + "logits/rejected": -2.5878868103027344, + "logps/chosen": -215.91079711914062, + "logps/rejected": -239.40243530273438, + "loss": 379.2808, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 41.98701858520508, + "rewards/margins": 5.5673136711120605, + "rewards/rejected": 36.41970443725586, + "step": 2960 + }, + { + "epoch": 0.5829244357212954, + "grad_norm": 4987.044244288877, + "learning_rate": 2.2137064822255086e-07, + "logits/chosen": -2.6021924018859863, + "logits/rejected": -2.5253403186798096, + "logps/chosen": -181.9603729248047, + "logps/rejected": -176.57830810546875, + "loss": 393.5017, + "rewards/accuracies": 
0.4333333373069763, + "rewards/chosen": 38.797855377197266, + "rewards/margins": -0.5924438238143921, + "rewards/rejected": 39.39030075073242, + "step": 2970 + }, + { + "epoch": 0.5848871442590775, + "grad_norm": 4962.65706927889, + "learning_rate": 2.1966962972921322e-07, + "logits/chosen": -2.6079602241516113, + "logits/rejected": -2.55991792678833, + "logps/chosen": -200.67337036132812, + "logps/rejected": -234.6151885986328, + "loss": 492.385, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 36.16741180419922, + "rewards/margins": 1.037660837173462, + "rewards/rejected": 35.12975311279297, + "step": 2980 + }, + { + "epoch": 0.5868498527968596, + "grad_norm": 5511.373165062106, + "learning_rate": 2.1797003519354285e-07, + "logits/chosen": -2.632431745529175, + "logits/rejected": -2.6114015579223633, + "logps/chosen": -224.2974090576172, + "logps/rejected": -224.2973175048828, + "loss": 458.8507, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 37.5731201171875, + "rewards/margins": -1.473892331123352, + "rewards/rejected": 39.04701614379883, + "step": 2990 + }, + { + "epoch": 0.5888125613346418, + "grad_norm": 5348.887293650911, + "learning_rate": 2.1627194440852142e-07, + "logits/chosen": -2.4771065711975098, + "logits/rejected": -2.5486245155334473, + "logps/chosen": -254.0469207763672, + "logps/rejected": -229.03182983398438, + "loss": 479.329, + "rewards/accuracies": 0.533333420753479, + "rewards/chosen": 39.728477478027344, + "rewards/margins": 10.570646286010742, + "rewards/rejected": 29.157833099365234, + "step": 3000 + }, + { + "epoch": 0.5907752698724239, + "grad_norm": 4409.821263587961, + "learning_rate": 2.1457543709653176e-07, + "logits/chosen": -2.642092227935791, + "logits/rejected": -2.6010630130767822, + "logps/chosen": -236.6316680908203, + "logps/rejected": -216.1038055419922, + "loss": 456.4789, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 37.16182327270508, + "rewards/margins": 1.520094633102417, + "rewards/rejected": 35.641727447509766, + "step": 3010 + }, + { + "epoch": 0.592737978410206, + "grad_norm": 4895.273800471361, + "learning_rate": 2.128805929056154e-07, + "logits/chosen": -2.605106830596924, + "logits/rejected": -2.6045784950256348, + "logps/chosen": -153.69691467285156, + "logps/rejected": -166.0969696044922, + "loss": 438.2885, + "rewards/accuracies": 0.5, + "rewards/chosen": 27.13759994506836, + "rewards/margins": -1.3463178873062134, + "rewards/rejected": 28.48392105102539, + "step": 3020 + }, + { + "epoch": 0.5947006869479883, + "grad_norm": 4532.808190246022, + "learning_rate": 2.1118749140573358e-07, + "logits/chosen": -2.6185317039489746, + "logits/rejected": -2.553497552871704, + "logps/chosen": -218.6675262451172, + "logps/rejected": -216.93447875976562, + "loss": 430.802, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": 35.49135208129883, + "rewards/margins": 1.2050817012786865, + "rewards/rejected": 34.28627014160156, + "step": 3030 + }, + { + "epoch": 0.5966633954857704, + "grad_norm": 4602.001983182268, + "learning_rate": 2.0949621208503092e-07, + "logits/chosen": -2.5064640045166016, + "logits/rejected": -2.5191729068756104, + "logps/chosen": -250.8184356689453, + "logps/rejected": -188.92161560058594, + "loss": 459.5887, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 49.90302276611328, + "rewards/margins": 17.360546112060547, + "rewards/rejected": 32.54247283935547, + "step": 3040 + }, + { + "epoch": 0.5986261040235525, + "grad_norm": 3721.707073522762, + 
"learning_rate": 2.0780683434610413e-07, + "logits/chosen": -2.5521321296691895, + "logits/rejected": -2.5300755500793457, + "logps/chosen": -205.0784912109375, + "logps/rejected": -224.0399627685547, + "loss": 426.4348, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 35.36996841430664, + "rewards/margins": 4.673448085784912, + "rewards/rejected": 30.696521759033203, + "step": 3050 + }, + { + "epoch": 0.6005888125613347, + "grad_norm": 5192.749971428991, + "learning_rate": 2.0611943750227375e-07, + "logits/chosen": -2.5217807292938232, + "logits/rejected": -2.4791207313537598, + "logps/chosen": -219.89437866210938, + "logps/rejected": -198.4583282470703, + "loss": 433.7738, + "rewards/accuracies": 0.43333330750465393, + "rewards/chosen": 42.70241165161133, + "rewards/margins": -2.064373016357422, + "rewards/rejected": 44.76677703857422, + "step": 3060 + }, + { + "epoch": 0.6025515210991168, + "grad_norm": 5461.744696463223, + "learning_rate": 2.044341007738612e-07, + "logits/chosen": -2.6488800048828125, + "logits/rejected": -2.5429649353027344, + "logps/chosen": -296.5935363769531, + "logps/rejected": -253.7562255859375, + "loss": 420.5087, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": 54.2684326171875, + "rewards/margins": 10.449033737182617, + "rewards/rejected": 43.819400787353516, + "step": 3070 + }, + { + "epoch": 0.6045142296368989, + "grad_norm": 4070.9126786852544, + "learning_rate": 2.027509032844687e-07, + "logits/chosen": -2.783871650695801, + "logits/rejected": -2.8232202529907227, + "logps/chosen": -271.37237548828125, + "logps/rejected": -303.91717529296875, + "loss": 513.612, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 48.6921501159668, + "rewards/margins": 0.8832836151123047, + "rewards/rejected": 47.808868408203125, + "step": 3080 + }, + { + "epoch": 0.6064769381746811, + "grad_norm": 4759.790843261946, + "learning_rate": 2.010699240572651e-07, + "logits/chosen": -2.656386613845825, + "logits/rejected": -2.7077832221984863, + "logps/chosen": -314.8296203613281, + "logps/rejected": -276.51123046875, + "loss": 542.8778, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 44.121734619140625, + "rewards/margins": -4.4381608963012695, + "rewards/rejected": 48.559898376464844, + "step": 3090 + }, + { + "epoch": 0.6084396467124632, + "grad_norm": 4163.665532803295, + "learning_rate": 1.993912420112756e-07, + "logits/chosen": -2.493317127227783, + "logits/rejected": -2.449044704437256, + "logps/chosen": -256.8260192871094, + "logps/rejected": -284.9006042480469, + "loss": 460.4679, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 66.61991882324219, + "rewards/margins": -32.62569808959961, + "rewards/rejected": 99.24561309814453, + "step": 3100 + }, + { + "epoch": 0.6104023552502453, + "grad_norm": 11937.020377702016, + "learning_rate": 1.9771493595767707e-07, + "logits/chosen": -2.5441012382507324, + "logits/rejected": -2.5751733779907227, + "logps/chosen": -247.4366912841797, + "logps/rejected": -298.6747741699219, + "loss": 441.9069, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 38.246559143066406, + "rewards/margins": 10.147356986999512, + "rewards/rejected": 28.09920310974121, + "step": 3110 + }, + { + "epoch": 0.6123650637880275, + "grad_norm": 4595.713922683333, + "learning_rate": 1.9604108459609752e-07, + "logits/chosen": -2.616608142852783, + "logits/rejected": -2.595402240753174, + "logps/chosen": -271.9317932128906, + "logps/rejected": -279.56298828125, + "loss": 430.3151, 
+ "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": 47.826576232910156, + "rewards/margins": 9.75730037689209, + "rewards/rejected": 38.069278717041016, + "step": 3120 + }, + { + "epoch": 0.6143277723258096, + "grad_norm": 4013.64311777735, + "learning_rate": 1.9436976651092142e-07, + "logits/chosen": -2.5802206993103027, + "logits/rejected": -2.518826961517334, + "logps/chosen": -244.60006713867188, + "logps/rejected": -230.1786346435547, + "loss": 482.6275, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 64.09306335449219, + "rewards/margins": -5.149061679840088, + "rewards/rejected": 69.24212646484375, + "step": 3130 + }, + { + "epoch": 0.6162904808635917, + "grad_norm": 4316.587871257736, + "learning_rate": 1.9270106016760035e-07, + "logits/chosen": -2.676774263381958, + "logits/rejected": -2.6333038806915283, + "logps/chosen": -221.6033172607422, + "logps/rejected": -231.56314086914062, + "loss": 388.8536, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 42.179222106933594, + "rewards/margins": 0.9718021154403687, + "rewards/rejected": 41.207420349121094, + "step": 3140 + }, + { + "epoch": 0.6182531894013739, + "grad_norm": 4599.885600859548, + "learning_rate": 1.9103504390896944e-07, + "logits/chosen": -2.6174519062042236, + "logits/rejected": -2.568662643432617, + "logps/chosen": -190.2574462890625, + "logps/rejected": -243.7998046875, + "loss": 433.49, + "rewards/accuracies": 0.3999999761581421, + "rewards/chosen": 33.63319778442383, + "rewards/margins": -8.475696563720703, + "rewards/rejected": 42.10889434814453, + "step": 3150 + }, + { + "epoch": 0.620215897939156, + "grad_norm": 4708.710117073037, + "learning_rate": 1.8937179595156876e-07, + "logits/chosen": -2.6824817657470703, + "logits/rejected": -2.5376830101013184, + "logps/chosen": -236.5217742919922, + "logps/rejected": -173.81320190429688, + "loss": 472.0527, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 42.436920166015625, + "rewards/margins": 7.270151615142822, + "rewards/rejected": 35.166770935058594, + "step": 3160 + }, + { + "epoch": 0.6221786064769381, + "grad_norm": 5239.948622278977, + "learning_rate": 1.8771139438197168e-07, + "logits/chosen": -2.6610610485076904, + "logits/rejected": -2.5386433601379395, + "logps/chosen": -254.28652954101562, + "logps/rejected": -267.49725341796875, + "loss": 515.909, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 37.86225509643555, + "rewards/margins": 5.88637638092041, + "rewards/rejected": 31.975879669189453, + "step": 3170 + }, + { + "epoch": 0.6241413150147204, + "grad_norm": 3644.418786384365, + "learning_rate": 1.8605391715311846e-07, + "logits/chosen": -2.4733784198760986, + "logits/rejected": -2.344364881515503, + "logps/chosen": -246.8455352783203, + "logps/rejected": -174.06283569335938, + "loss": 437.8695, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": 38.20279312133789, + "rewards/margins": 5.049153804779053, + "rewards/rejected": 33.15364074707031, + "step": 3180 + }, + { + "epoch": 0.6261040235525025, + "grad_norm": 5441.213028983494, + "learning_rate": 1.8439944208065704e-07, + "logits/chosen": -2.672022819519043, + "logits/rejected": -2.6368296146392822, + "logps/chosen": -299.55938720703125, + "logps/rejected": -291.67816162109375, + "loss": 476.3895, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": 50.05424880981445, + "rewards/margins": 15.180732727050781, + "rewards/rejected": 34.8735237121582, + "step": 3190 + }, + { + "epoch": 0.6280667320902846, 
+ "grad_norm": 4261.357128597478, + "learning_rate": 1.8274804683928913e-07, + "logits/chosen": -2.6394951343536377, + "logits/rejected": -2.5230634212493896, + "logps/chosen": -291.575439453125, + "logps/rejected": -241.4493865966797, + "loss": 480.2888, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 41.28780746459961, + "rewards/margins": -0.9671966433525085, + "rewards/rejected": 42.2550048828125, + "step": 3200 + }, + { + "epoch": 0.6300294406280668, + "grad_norm": 4478.453528554573, + "learning_rate": 1.810998089591238e-07, + "logits/chosen": -2.676175117492676, + "logits/rejected": -2.6173923015594482, + "logps/chosen": -212.4101104736328, + "logps/rejected": -218.7488250732422, + "loss": 445.4913, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 40.66368865966797, + "rewards/margins": 0.6559812426567078, + "rewards/rejected": 40.00770950317383, + "step": 3210 + }, + { + "epoch": 0.6319921491658489, + "grad_norm": 4539.001330624233, + "learning_rate": 1.7945480582203745e-07, + "logits/chosen": -2.58876371383667, + "logits/rejected": -2.568467378616333, + "logps/chosen": -198.20535278320312, + "logps/rejected": -231.5351104736328, + "loss": 403.5555, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 50.637123107910156, + "rewards/margins": 14.340398788452148, + "rewards/rejected": 36.296722412109375, + "step": 3220 + }, + { + "epoch": 0.633954857703631, + "grad_norm": 4529.697661512652, + "learning_rate": 1.7781311465804128e-07, + "logits/chosen": -2.485405445098877, + "logits/rejected": -2.449179172515869, + "logps/chosen": -214.7068634033203, + "logps/rejected": -192.54421997070312, + "loss": 431.5271, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 64.39295196533203, + "rewards/margins": 23.37224006652832, + "rewards/rejected": 41.020721435546875, + "step": 3230 + }, + { + "epoch": 0.6359175662414132, + "grad_norm": 5177.31263434082, + "learning_rate": 1.7617481254165487e-07, + "logits/chosen": -2.5449860095977783, + "logits/rejected": -2.557673931121826, + "logps/chosen": -207.82638549804688, + "logps/rejected": -189.16799926757812, + "loss": 477.9846, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 63.777488708496094, + "rewards/margins": 30.085861206054688, + "rewards/rejected": 33.69163131713867, + "step": 3240 + }, + { + "epoch": 0.6378802747791953, + "grad_norm": 4353.921471179031, + "learning_rate": 1.745399763882881e-07, + "logits/chosen": -2.573568344116211, + "logits/rejected": -2.520761013031006, + "logps/chosen": -253.19186401367188, + "logps/rejected": -242.68246459960938, + "loss": 397.9687, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 36.56116485595703, + "rewards/margins": 2.457200527191162, + "rewards/rejected": 34.10395812988281, + "step": 3250 + }, + { + "epoch": 0.6398429833169774, + "grad_norm": 4828.868713875561, + "learning_rate": 1.7290868295062983e-07, + "logits/chosen": -2.4604735374450684, + "logits/rejected": -2.5235681533813477, + "logps/chosen": -242.060302734375, + "logps/rejected": -246.6624298095703, + "loss": 452.2631, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 45.33163070678711, + "rewards/margins": 7.067657470703125, + "rewards/rejected": 38.263973236083984, + "step": 3260 + }, + { + "epoch": 0.6418056918547596, + "grad_norm": 5303.634850928562, + "learning_rate": 1.7128100881504492e-07, + "logits/chosen": -2.565659999847412, + "logits/rejected": -2.437257766723633, + "logps/chosen": -241.627685546875, + "logps/rejected": 
-187.8494415283203, + "loss": 508.0383, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": 44.94586181640625, + "rewards/margins": 15.400263786315918, + "rewards/rejected": 29.54559898376465, + "step": 3270 + }, + { + "epoch": 0.6437684003925417, + "grad_norm": 4748.199539584447, + "learning_rate": 1.6965703039797808e-07, + "logits/chosen": -2.548427104949951, + "logits/rejected": -2.4462082386016846, + "logps/chosen": -266.9375, + "logps/rejected": -184.31228637695312, + "loss": 455.7013, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 48.49503707885742, + "rewards/margins": 5.851919651031494, + "rewards/rejected": 42.64311981201172, + "step": 3280 + }, + { + "epoch": 0.6457311089303238, + "grad_norm": 5059.711030740259, + "learning_rate": 1.6803682394236656e-07, + "logits/chosen": -2.7422773838043213, + "logits/rejected": -2.6419894695281982, + "logps/chosen": -285.46710205078125, + "logps/rejected": -216.86337280273438, + "loss": 411.9077, + "rewards/accuracies": 0.6333333849906921, + "rewards/chosen": 46.86991500854492, + "rewards/margins": -5.114217281341553, + "rewards/rejected": 51.984130859375, + "step": 3290 + }, + { + "epoch": 0.647693817468106, + "grad_norm": 4717.050316429819, + "learning_rate": 1.664204655140607e-07, + "logits/chosen": -2.5475409030914307, + "logits/rejected": -2.5353424549102783, + "logps/chosen": -196.85162353515625, + "logps/rejected": -225.6861114501953, + "loss": 469.1195, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 40.43767547607422, + "rewards/margins": -12.499992370605469, + "rewards/rejected": 52.93767166137695, + "step": 3300 + }, + { + "epoch": 0.6496565260058881, + "grad_norm": 4539.935608470393, + "learning_rate": 1.6480803099825277e-07, + "logits/chosen": -2.6134755611419678, + "logits/rejected": -2.5636157989501953, + "logps/chosen": -215.1249237060547, + "logps/rejected": -166.5946807861328, + "loss": 435.5142, + "rewards/accuracies": 0.5, + "rewards/chosen": 46.504249572753906, + "rewards/margins": -8.098384857177734, + "rewards/rejected": 54.602638244628906, + "step": 3310 + }, + { + "epoch": 0.6516192345436702, + "grad_norm": 4318.11502712315, + "learning_rate": 1.6319959609591412e-07, + "logits/chosen": -2.493683338165283, + "logits/rejected": -2.4112517833709717, + "logps/chosen": -193.81930541992188, + "logps/rejected": -158.67062377929688, + "loss": 411.9584, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 48.755897521972656, + "rewards/margins": 11.440227508544922, + "rewards/rejected": 37.31566619873047, + "step": 3320 + }, + { + "epoch": 0.6535819430814525, + "grad_norm": 4180.583847225108, + "learning_rate": 1.6159523632024126e-07, + "logits/chosen": -2.603215456008911, + "logits/rejected": -2.5020575523376465, + "logps/chosen": -246.5648956298828, + "logps/rejected": -274.041259765625, + "loss": 465.5974, + "rewards/accuracies": 0.5, + "rewards/chosen": 42.838783264160156, + "rewards/margins": 1.3022468090057373, + "rewards/rejected": 41.53654098510742, + "step": 3330 + }, + { + "epoch": 0.6555446516192346, + "grad_norm": 5364.492058913141, + "learning_rate": 1.599950269931107e-07, + "logits/chosen": -2.4481101036071777, + "logits/rejected": -2.4633541107177734, + "logps/chosen": -259.56243896484375, + "logps/rejected": -212.51803588867188, + "loss": 471.3864, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": 36.67453384399414, + "rewards/margins": -3.0609450340270996, + "rewards/rejected": 39.73548126220703, + "step": 3340 + }, + { + "epoch": 
0.6575073601570167, + "grad_norm": 3839.3877086108932, + "learning_rate": 1.5839904324154273e-07, + "logits/chosen": -2.592939853668213, + "logits/rejected": -2.455808401107788, + "logps/chosen": -243.414306640625, + "logps/rejected": -233.156982421875, + "loss": 460.8823, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 41.609745025634766, + "rewards/margins": -20.40592384338379, + "rewards/rejected": 62.01567459106445, + "step": 3350 + }, + { + "epoch": 0.6594700686947988, + "grad_norm": 4412.835972775599, + "learning_rate": 1.568073599941742e-07, + "logits/chosen": -2.6051926612854004, + "logits/rejected": -2.6242880821228027, + "logps/chosen": -260.4896545410156, + "logps/rejected": -234.35903930664062, + "loss": 470.7374, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": 38.179622650146484, + "rewards/margins": -1.027712345123291, + "rewards/rejected": 39.20732879638672, + "step": 3360 + }, + { + "epoch": 0.661432777232581, + "grad_norm": 3617.8577010382865, + "learning_rate": 1.552200519777408e-07, + "logits/chosen": -2.6957767009735107, + "logits/rejected": -2.5345678329467773, + "logps/chosen": -273.77520751953125, + "logps/rejected": -219.6286163330078, + "loss": 490.3991, + "rewards/accuracies": 0.5, + "rewards/chosen": 44.90713882446289, + "rewards/margins": 16.124292373657227, + "rewards/rejected": 28.7828426361084, + "step": 3370 + }, + { + "epoch": 0.6633954857703631, + "grad_norm": 4019.619244739591, + "learning_rate": 1.5363719371356882e-07, + "logits/chosen": -2.7795815467834473, + "logits/rejected": -2.727719306945801, + "logps/chosen": -290.29443359375, + "logps/rejected": -190.9752960205078, + "loss": 368.1448, + "rewards/accuracies": 0.5, + "rewards/chosen": 50.465457916259766, + "rewards/margins": 13.8067626953125, + "rewards/rejected": 36.658695220947266, + "step": 3380 + }, + { + "epoch": 0.6653581943081452, + "grad_norm": 4725.070966876188, + "learning_rate": 1.5205885951407665e-07, + "logits/chosen": -2.6008598804473877, + "logits/rejected": -2.5928776264190674, + "logps/chosen": -214.3909454345703, + "logps/rejected": -253.8872833251953, + "loss": 430.3905, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 39.41809844970703, + "rewards/margins": -9.465060234069824, + "rewards/rejected": 48.883155822753906, + "step": 3390 + }, + { + "epoch": 0.6673209028459274, + "grad_norm": 5097.329526473776, + "learning_rate": 1.5048512347928564e-07, + "logits/chosen": -2.5902061462402344, + "logits/rejected": -2.38393497467041, + "logps/chosen": -249.7587432861328, + "logps/rejected": -171.68746948242188, + "loss": 440.4719, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 38.5333366394043, + "rewards/margins": 7.863332271575928, + "rewards/rejected": 30.67000389099121, + "step": 3400 + }, + { + "epoch": 0.6692836113837095, + "grad_norm": 4547.883065698954, + "learning_rate": 1.4891605949334133e-07, + "logits/chosen": -2.750488042831421, + "logits/rejected": -2.6084415912628174, + "logps/chosen": -425.6971130371094, + "logps/rejected": -345.8495788574219, + "loss": 519.4268, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": 47.112815856933594, + "rewards/margins": 13.372627258300781, + "rewards/rejected": 33.74019241333008, + "step": 3410 + }, + { + "epoch": 0.6712463199214916, + "grad_norm": 5017.880275091979, + "learning_rate": 1.4735174122104476e-07, + "logits/chosen": -2.5117251873016357, + "logits/rejected": -2.4606595039367676, + "logps/chosen": -201.7481231689453, + "logps/rejected": 
-164.49876403808594, + "loss": 430.2011, + "rewards/accuracies": 0.6333332657814026, + "rewards/chosen": 37.651527404785156, + "rewards/margins": 3.23347544670105, + "rewards/rejected": 34.41805648803711, + "step": 3420 + }, + { + "epoch": 0.6732090284592738, + "grad_norm": 4525.0237736900335, + "learning_rate": 1.457922421043943e-07, + "logits/chosen": -2.6487231254577637, + "logits/rejected": -2.4554855823516846, + "logps/chosen": -283.5890197753906, + "logps/rejected": -169.73692321777344, + "loss": 429.0944, + "rewards/accuracies": 0.5, + "rewards/chosen": 36.453189849853516, + "rewards/margins": 0.8102760314941406, + "rewards/rejected": 35.642913818359375, + "step": 3430 + }, + { + "epoch": 0.6751717369970559, + "grad_norm": 4836.351887681175, + "learning_rate": 1.4423763535913704e-07, + "logits/chosen": -2.7184793949127197, + "logits/rejected": -2.66823148727417, + "logps/chosen": -220.481201171875, + "logps/rejected": -233.3111572265625, + "loss": 444.5051, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 46.962947845458984, + "rewards/margins": 12.238215446472168, + "rewards/rejected": 34.724735260009766, + "step": 3440 + }, + { + "epoch": 0.677134445534838, + "grad_norm": 5097.525411981749, + "learning_rate": 1.426879939713322e-07, + "logits/chosen": -2.6069436073303223, + "logits/rejected": -2.58504581451416, + "logps/chosen": -262.5929870605469, + "logps/rejected": -197.80320739746094, + "loss": 414.4149, + "rewards/accuracies": 0.5, + "rewards/chosen": 41.25318908691406, + "rewards/margins": 3.419710874557495, + "rewards/rejected": 37.83348083496094, + "step": 3450 + }, + { + "epoch": 0.6790971540726202, + "grad_norm": 3986.6147298360115, + "learning_rate": 1.4114339069392374e-07, + "logits/chosen": -2.6958091259002686, + "logits/rejected": -2.527467727661133, + "logps/chosen": -251.56552124023438, + "logps/rejected": -173.15176391601562, + "loss": 406.3061, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 45.1236686706543, + "rewards/margins": 6.652157783508301, + "rewards/rejected": 38.47150802612305, + "step": 3460 + }, + { + "epoch": 0.6810598626104023, + "grad_norm": 4532.603335528347, + "learning_rate": 1.3960389804332556e-07, + "logits/chosen": -2.560441493988037, + "logits/rejected": -2.5246596336364746, + "logps/chosen": -216.03915405273438, + "logps/rejected": -249.4767303466797, + "loss": 432.2447, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 41.10346221923828, + "rewards/margins": -1.274405837059021, + "rewards/rejected": 42.377864837646484, + "step": 3470 + }, + { + "epoch": 0.6830225711481845, + "grad_norm": 4453.197624373022, + "learning_rate": 1.380695882960165e-07, + "logits/chosen": -2.591632843017578, + "logits/rejected": -2.5592544078826904, + "logps/chosen": -243.0457763671875, + "logps/rejected": -180.34759521484375, + "loss": 409.3893, + "rewards/accuracies": 0.3999999761581421, + "rewards/chosen": 29.93951988220215, + "rewards/margins": -16.706111907958984, + "rewards/rejected": 46.6456298828125, + "step": 3480 + }, + { + "epoch": 0.6849852796859667, + "grad_norm": 4587.090606395604, + "learning_rate": 1.3654053348514702e-07, + "logits/chosen": -2.3719887733459473, + "logits/rejected": -2.284217357635498, + "logps/chosen": -130.6105499267578, + "logps/rejected": -172.13636779785156, + "loss": 469.2424, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 33.91435623168945, + "rewards/margins": 4.379640102386475, + "rewards/rejected": 29.534717559814453, + "step": 3490 + }, + { + "epoch": 
0.6869479882237488, + "grad_norm": 4560.865072217876, + "learning_rate": 1.350168053971577e-07, + "logits/chosen": -2.571834087371826, + "logits/rejected": -2.5446648597717285, + "logps/chosen": -318.7664794921875, + "logps/rejected": -205.4440460205078, + "loss": 483.0706, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 43.74156951904297, + "rewards/margins": 9.468842506408691, + "rewards/rejected": 34.272727966308594, + "step": 3500 + }, + { + "epoch": 0.6889106967615309, + "grad_norm": 4127.48376874728, + "learning_rate": 1.3349847556840876e-07, + "logits/chosen": -2.6035072803497314, + "logits/rejected": -2.621722936630249, + "logps/chosen": -200.86965942382812, + "logps/rejected": -229.7669219970703, + "loss": 503.4723, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 44.48680877685547, + "rewards/margins": 7.581006050109863, + "rewards/rejected": 36.905799865722656, + "step": 3510 + }, + { + "epoch": 0.6908734052993131, + "grad_norm": 4659.940196560349, + "learning_rate": 1.3198561528182182e-07, + "logits/chosen": -2.5462212562561035, + "logits/rejected": -2.5860719680786133, + "logps/chosen": -170.47500610351562, + "logps/rejected": -178.5261688232422, + "loss": 484.3155, + "rewards/accuracies": 0.36666667461395264, + "rewards/chosen": 28.57163429260254, + "rewards/margins": -16.579164505004883, + "rewards/rejected": 45.15079879760742, + "step": 3520 + }, + { + "epoch": 0.6928361138370952, + "grad_norm": 5071.0735487629045, + "learning_rate": 1.3047829556353263e-07, + "logits/chosen": -2.592815399169922, + "logits/rejected": -2.55568265914917, + "logps/chosen": -216.5419921875, + "logps/rejected": -212.7433624267578, + "loss": 467.819, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 36.28511428833008, + "rewards/margins": 9.000925064086914, + "rewards/rejected": 27.2841854095459, + "step": 3530 + }, + { + "epoch": 0.6947988223748773, + "grad_norm": 4320.72890254297, + "learning_rate": 1.2897658717955742e-07, + "logits/chosen": -2.4630746841430664, + "logits/rejected": -2.3922531604766846, + "logps/chosen": -222.0006866455078, + "logps/rejected": -181.97515869140625, + "loss": 387.9535, + "rewards/accuracies": 0.43333330750465393, + "rewards/chosen": 40.32611083984375, + "rewards/margins": 8.769170761108398, + "rewards/rejected": 31.556941986083984, + "step": 3540 + }, + { + "epoch": 0.6967615309126595, + "grad_norm": 4726.8806674800735, + "learning_rate": 1.2748056063246994e-07, + "logits/chosen": -2.6976161003112793, + "logits/rejected": -2.6608502864837646, + "logps/chosen": -262.3450622558594, + "logps/rejected": -241.70654296875, + "loss": 478.4425, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 39.3624153137207, + "rewards/margins": -8.320429801940918, + "rewards/rejected": 47.68284225463867, + "step": 3550 + }, + { + "epoch": 0.6987242394504416, + "grad_norm": 4601.9229330035905, + "learning_rate": 1.2599028615809183e-07, + "logits/chosen": -2.582568645477295, + "logits/rejected": -2.5583627223968506, + "logps/chosen": -257.66522216796875, + "logps/rejected": -201.1621856689453, + "loss": 453.1743, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": 39.114158630371094, + "rewards/margins": 1.6954081058502197, + "rewards/rejected": 37.41875457763672, + "step": 3560 + }, + { + "epoch": 0.7006869479882237, + "grad_norm": 3547.0105030627583, + "learning_rate": 1.2450583372219458e-07, + "logits/chosen": -2.5113353729248047, + "logits/rejected": -2.5333168506622314, + "logps/chosen": -252.20834350585938, + 
"logps/rejected": -246.343017578125, + "loss": 418.4648, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 50.306793212890625, + "rewards/margins": 1.4378515481948853, + "rewards/rejected": 48.86893844604492, + "step": 3570 + }, + { + "epoch": 0.7026496565260059, + "grad_norm": 4122.336208978104, + "learning_rate": 1.230272730172157e-07, + "logits/chosen": -2.4826693534851074, + "logits/rejected": -2.4991822242736816, + "logps/chosen": -253.10952758789062, + "logps/rejected": -263.3420104980469, + "loss": 407.0652, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 43.307411193847656, + "rewards/margins": -1.0030360221862793, + "rewards/rejected": 44.310447692871094, + "step": 3580 + }, + { + "epoch": 0.704612365063788, + "grad_norm": 4848.095885022227, + "learning_rate": 1.2155467345898602e-07, + "logits/chosen": -2.607501745223999, + "logits/rejected": -2.520416736602783, + "logps/chosen": -218.1444854736328, + "logps/rejected": -248.6970977783203, + "loss": 443.058, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 49.31399917602539, + "rewards/margins": -15.503260612487793, + "rewards/rejected": 64.81726837158203, + "step": 3590 + }, + { + "epoch": 0.7065750736015701, + "grad_norm": 3891.8257985300042, + "learning_rate": 1.2008810418347093e-07, + "logits/chosen": -2.5468451976776123, + "logits/rejected": -2.646713972091675, + "logps/chosen": -171.98916625976562, + "logps/rejected": -169.75364685058594, + "loss": 354.3096, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 29.069055557250977, + "rewards/margins": -12.974533081054688, + "rewards/rejected": 42.04358673095703, + "step": 3600 + }, + { + "epoch": 0.7085377821393523, + "grad_norm": 4447.461803676008, + "learning_rate": 1.1862763404352483e-07, + "logits/chosen": -2.733118772506714, + "logits/rejected": -2.6008687019348145, + "logps/chosen": -274.53125, + "logps/rejected": -239.507080078125, + "loss": 506.1706, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 39.546661376953125, + "rewards/margins": -0.24837017059326172, + "rewards/rejected": 39.7950325012207, + "step": 3610 + }, + { + "epoch": 0.7105004906771345, + "grad_norm": 4505.5930871988285, + "learning_rate": 1.1717333160565807e-07, + "logits/chosen": -2.662598133087158, + "logits/rejected": -2.610295057296753, + "logps/chosen": -303.64556884765625, + "logps/rejected": -236.7455596923828, + "loss": 454.2638, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 54.06207275390625, + "rewards/margins": 8.460227966308594, + "rewards/rejected": 45.601844787597656, + "step": 3620 + }, + { + "epoch": 0.7124631992149166, + "grad_norm": 5463.823656137664, + "learning_rate": 1.1572526514681874e-07, + "logits/chosen": -2.6133432388305664, + "logits/rejected": -2.5437867641448975, + "logps/chosen": -260.36273193359375, + "logps/rejected": -290.7547912597656, + "loss": 455.4876, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 37.65696716308594, + "rewards/margins": 0.33991679549217224, + "rewards/rejected": 37.31705093383789, + "step": 3630 + }, + { + "epoch": 0.7144259077526988, + "grad_norm": 4995.263176189, + "learning_rate": 1.1428350265118613e-07, + "logits/chosen": -2.666706085205078, + "logits/rejected": -2.551975727081299, + "logps/chosen": -272.3623352050781, + "logps/rejected": -243.81796264648438, + "loss": 491.338, + "rewards/accuracies": 0.533333420753479, + "rewards/chosen": 44.81690979003906, + "rewards/margins": 11.616350173950195, + "rewards/rejected": 33.20055389404297, 
+ "step": 3640 + }, + { + "epoch": 0.7163886162904809, + "grad_norm": 6107.795181914702, + "learning_rate": 1.128481118069799e-07, + "logits/chosen": -2.6353912353515625, + "logits/rejected": -2.429758071899414, + "logps/chosen": -231.792724609375, + "logps/rejected": -225.7418212890625, + "loss": 478.819, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 39.23735427856445, + "rewards/margins": -10.274882316589355, + "rewards/rejected": 49.51223373413086, + "step": 3650 + }, + { + "epoch": 0.718351324828263, + "grad_norm": 4559.000157082686, + "learning_rate": 1.114191600032815e-07, + "logits/chosen": -2.705068588256836, + "logits/rejected": -2.5441765785217285, + "logps/chosen": -266.17327880859375, + "logps/rejected": -232.7379913330078, + "loss": 478.844, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 45.4412727355957, + "rewards/margins": 4.956995964050293, + "rewards/rejected": 40.484275817871094, + "step": 3660 + }, + { + "epoch": 0.7203140333660452, + "grad_norm": 5142.283096753777, + "learning_rate": 1.0999671432687099e-07, + "logits/chosen": -2.5555663108825684, + "logits/rejected": -2.3798253536224365, + "logps/chosen": -250.02890014648438, + "logps/rejected": -199.82859802246094, + "loss": 469.3981, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 48.90443801879883, + "rewards/margins": 12.09408187866211, + "rewards/rejected": 36.810359954833984, + "step": 3670 + }, + { + "epoch": 0.7222767419038273, + "grad_norm": 4831.377411388173, + "learning_rate": 1.085808415590772e-07, + "logits/chosen": -2.70011568069458, + "logits/rejected": -2.653996706008911, + "logps/chosen": -252.80184936523438, + "logps/rejected": -220.48556518554688, + "loss": 441.9232, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 32.957645416259766, + "rewards/margins": 5.683412551879883, + "rewards/rejected": 27.27423667907715, + "step": 3680 + }, + { + "epoch": 0.7242394504416094, + "grad_norm": 4974.359444411199, + "learning_rate": 1.0717160817264217e-07, + "logits/chosen": -2.6733148097991943, + "logits/rejected": -2.5146970748901367, + "logps/chosen": -226.50149536132812, + "logps/rejected": -233.2989501953125, + "loss": 423.3871, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 45.977073669433594, + "rewards/margins": 10.666391372680664, + "rewards/rejected": 35.31068801879883, + "step": 3690 + }, + { + "epoch": 0.7262021589793916, + "grad_norm": 4246.788329339762, + "learning_rate": 1.0576908032860088e-07, + "logits/chosen": -2.3461179733276367, + "logits/rejected": -2.367692232131958, + "logps/chosen": -205.265625, + "logps/rejected": -166.40811157226562, + "loss": 433.289, + "rewards/accuracies": 0.4999999403953552, + "rewards/chosen": 51.241355895996094, + "rewards/margins": 10.636072158813477, + "rewards/rejected": 40.605281829833984, + "step": 3700 + }, + { + "epoch": 0.7281648675171737, + "grad_norm": 4238.6510414008535, + "learning_rate": 1.0437332387317474e-07, + "logits/chosen": -2.6798744201660156, + "logits/rejected": -2.5828280448913574, + "logps/chosen": -207.62948608398438, + "logps/rejected": -175.34597778320312, + "loss": 449.38, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 37.80424118041992, + "rewards/margins": 1.2310230731964111, + "rewards/rejected": 36.573219299316406, + "step": 3710 + }, + { + "epoch": 0.7301275760549558, + "grad_norm": 4840.256252230794, + "learning_rate": 1.0298440433468048e-07, + "logits/chosen": -2.762472152709961, + "logits/rejected": -2.674773693084717, + 
"logps/chosen": -292.7893981933594, + "logps/rejected": -214.03201293945312, + "loss": 504.2667, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 44.73220443725586, + "rewards/margins": 6.6243391036987305, + "rewards/rejected": 38.107872009277344, + "step": 3720 + }, + { + "epoch": 0.732090284592738, + "grad_norm": 5142.153148669025, + "learning_rate": 1.0160238692045331e-07, + "logits/chosen": -2.6255710124969482, + "logits/rejected": -2.4991328716278076, + "logps/chosen": -219.4676513671875, + "logps/rejected": -178.04421997070312, + "loss": 418.81, + "rewards/accuracies": 0.5, + "rewards/chosen": 31.921016693115234, + "rewards/margins": 1.1468164920806885, + "rewards/rejected": 30.774200439453125, + "step": 3730 + }, + { + "epoch": 0.7340529931305201, + "grad_norm": 5349.043883670735, + "learning_rate": 1.0022733651378606e-07, + "logits/chosen": -2.691039562225342, + "logits/rejected": -2.532325506210327, + "logps/chosen": -333.5316467285156, + "logps/rejected": -235.3372344970703, + "loss": 483.2089, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 39.4155387878418, + "rewards/margins": 4.746474266052246, + "rewards/rejected": 34.6690673828125, + "step": 3740 + }, + { + "epoch": 0.7360157016683022, + "grad_norm": 5552.097807426382, + "learning_rate": 9.88593176708827e-08, + "logits/chosen": -2.5714547634124756, + "logits/rejected": -2.592780351638794, + "logps/chosen": -232.8053741455078, + "logps/rejected": -243.07565307617188, + "loss": 453.0571, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 42.47258758544922, + "rewards/margins": -0.20083312690258026, + "rewards/rejected": 42.673423767089844, + "step": 3750 + }, + { + "epoch": 0.7379784102060843, + "grad_norm": 3840.3442356230516, + "learning_rate": 9.749839461782769e-08, + "logits/chosen": -2.7019336223602295, + "logits/rejected": -2.7575888633728027, + "logps/chosen": -218.07241821289062, + "logps/rejected": -260.0151062011719, + "loss": 420.3206, + "rewards/accuracies": 0.5, + "rewards/chosen": 31.354604721069336, + "rewards/margins": -0.9220180511474609, + "rewards/rejected": 32.2766227722168, + "step": 3760 + }, + { + "epoch": 0.7399411187438666, + "grad_norm": 4322.842993427434, + "learning_rate": 9.614463124757041e-08, + "logits/chosen": -2.4091198444366455, + "logits/rejected": -2.4457297325134277, + "logps/chosen": -200.60665893554688, + "logps/rejected": -193.24002075195312, + "loss": 400.7776, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 35.54182434082031, + "rewards/margins": -4.2541184425354, + "rewards/rejected": 39.795936584472656, + "step": 3770 + }, + { + "epoch": 0.7419038272816487, + "grad_norm": 4109.254624443096, + "learning_rate": 9.479809111692586e-08, + "logits/chosen": -2.6434237957000732, + "logits/rejected": -2.640014171600342, + "logps/chosen": -197.17636108398438, + "logps/rejected": -240.40560913085938, + "loss": 468.0661, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 37.981597900390625, + "rewards/margins": -3.1270596981048584, + "rewards/rejected": 41.10865783691406, + "step": 3780 + }, + { + "epoch": 0.7438665358194309, + "grad_norm": 2855.689201059151, + "learning_rate": 9.345883744359065e-08, + "logits/chosen": -2.5281224250793457, + "logits/rejected": -2.5895755290985107, + "logps/chosen": -235.78555297851562, + "logps/rejected": -295.47418212890625, + "loss": 443.861, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 34.26362991333008, + "rewards/margins": -6.375296115875244, + "rewards/rejected": 
40.63892364501953, + "step": 3790 + }, + { + "epoch": 0.745829244357213, + "grad_norm": 4095.0063036058204, + "learning_rate": 9.212693310317479e-08, + "logits/chosen": -2.6409642696380615, + "logits/rejected": -2.624389171600342, + "logps/chosen": -219.658935546875, + "logps/rejected": -201.77801513671875, + "loss": 351.3882, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 34.03952407836914, + "rewards/margins": -6.243267059326172, + "rewards/rejected": 40.28278732299805, + "step": 3800 + }, + { + "epoch": 0.7477919528949951, + "grad_norm": 5539.3807407061995, + "learning_rate": 9.08024406262503e-08, + "logits/chosen": -2.657045841217041, + "logits/rejected": -2.575908660888672, + "logps/chosen": -203.8868408203125, + "logps/rejected": -216.73587036132812, + "loss": 441.1558, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 40.998573303222656, + "rewards/margins": 7.906311988830566, + "rewards/rejected": 33.092262268066406, + "step": 3810 + }, + { + "epoch": 0.7497546614327772, + "grad_norm": 4983.020657353442, + "learning_rate": 8.94854221954148e-08, + "logits/chosen": -2.6159934997558594, + "logits/rejected": -2.5689492225646973, + "logps/chosen": -196.431884765625, + "logps/rejected": -162.80032348632812, + "loss": 436.0953, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 37.00093078613281, + "rewards/margins": -5.000515937805176, + "rewards/rejected": 42.00144577026367, + "step": 3820 + }, + { + "epoch": 0.7517173699705594, + "grad_norm": 4490.380890972323, + "learning_rate": 8.817593964237316e-08, + "logits/chosen": -2.6067864894866943, + "logits/rejected": -2.5778086185455322, + "logps/chosen": -240.83688354492188, + "logps/rejected": -200.7977294921875, + "loss": 475.9042, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 38.47137451171875, + "rewards/margins": -4.470655918121338, + "rewards/rejected": 42.9420280456543, + "step": 3830 + }, + { + "epoch": 0.7536800785083415, + "grad_norm": 4077.9532632758896, + "learning_rate": 8.68740544450334e-08, + "logits/chosen": -2.733462333679199, + "logits/rejected": -2.5932741165161133, + "logps/chosen": -307.20343017578125, + "logps/rejected": -230.4998016357422, + "loss": 459.9514, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 53.13201904296875, + "rewards/margins": 15.279828071594238, + "rewards/rejected": 37.8521842956543, + "step": 3840 + }, + { + "epoch": 0.7556427870461236, + "grad_norm": 4075.7974134105666, + "learning_rate": 8.557982772462138e-08, + "logits/chosen": -2.468815803527832, + "logits/rejected": -2.475836992263794, + "logps/chosen": -220.2480926513672, + "logps/rejected": -208.54409790039062, + "loss": 431.1687, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 42.63874816894531, + "rewards/margins": -0.08769845962524414, + "rewards/rejected": 42.7264518737793, + "step": 3850 + }, + { + "epoch": 0.7576054955839058, + "grad_norm": 4183.992966358521, + "learning_rate": 8.429332024281088e-08, + "logits/chosen": -2.636207103729248, + "logits/rejected": -2.5418641567230225, + "logps/chosen": -255.40493774414062, + "logps/rejected": -189.78768920898438, + "loss": 432.5977, + "rewards/accuracies": 0.5, + "rewards/chosen": 30.501134872436523, + "rewards/margins": 3.4080607891082764, + "rewards/rejected": 27.09307289123535, + "step": 3860 + }, + { + "epoch": 0.7595682041216879, + "grad_norm": 4995.9974039380695, + "learning_rate": 8.301459239887073e-08, + "logits/chosen": -2.7311673164367676, + "logits/rejected": -2.5933837890625, + 
"logps/chosen": -290.5824279785156, + "logps/rejected": -235.07888793945312, + "loss": 500.8695, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 49.280540466308594, + "rewards/margins": 18.33820915222168, + "rewards/rejected": 30.942337036132812, + "step": 3870 + }, + { + "epoch": 0.76153091265947, + "grad_norm": 3567.3108992513685, + "learning_rate": 8.17437042268298e-08, + "logits/chosen": -2.6642038822174072, + "logits/rejected": -2.669226884841919, + "logps/chosen": -245.01382446289062, + "logps/rejected": -263.2218933105469, + "loss": 428.3638, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 43.7030029296875, + "rewards/margins": 0.9183242917060852, + "rewards/rejected": 42.7846794128418, + "step": 3880 + }, + { + "epoch": 0.7634936211972522, + "grad_norm": 4561.994494662221, + "learning_rate": 8.048071539265761e-08, + "logits/chosen": -2.6854119300842285, + "logits/rejected": -2.4938607215881348, + "logps/chosen": -265.8202819824219, + "logps/rejected": -190.0291290283203, + "loss": 484.5575, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 48.63257598876953, + "rewards/margins": 7.2778825759887695, + "rewards/rejected": 41.354698181152344, + "step": 3890 + }, + { + "epoch": 0.7654563297350343, + "grad_norm": 4900.414267620809, + "learning_rate": 7.922568519146425e-08, + "logits/chosen": -2.370140552520752, + "logits/rejected": -2.4470481872558594, + "logps/chosen": -196.44146728515625, + "logps/rejected": -162.85464477539062, + "loss": 392.1866, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 30.315418243408203, + "rewards/margins": -11.973292350769043, + "rewards/rejected": 42.2887077331543, + "step": 3900 + }, + { + "epoch": 0.7674190382728164, + "grad_norm": 4856.0820375206495, + "learning_rate": 7.79786725447154e-08, + "logits/chosen": -2.5297694206237793, + "logits/rejected": -2.509547710418701, + "logps/chosen": -224.86044311523438, + "logps/rejected": -189.3321990966797, + "loss": 462.1615, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 35.0362663269043, + "rewards/margins": 8.711492538452148, + "rewards/rejected": 26.32477378845215, + "step": 3910 + }, + { + "epoch": 0.7693817468105987, + "grad_norm": 5365.9583135093335, + "learning_rate": 7.6739735997467e-08, + "logits/chosen": -2.6649789810180664, + "logits/rejected": -2.6618189811706543, + "logps/chosen": -259.0755920410156, + "logps/rejected": -222.2447967529297, + "loss": 490.3834, + "rewards/accuracies": 0.533333420753479, + "rewards/chosen": 38.72358322143555, + "rewards/margins": 0.7858904600143433, + "rewards/rejected": 37.93769073486328, + "step": 3920 + }, + { + "epoch": 0.7713444553483808, + "grad_norm": 4335.707899099495, + "learning_rate": 7.550893371561593e-08, + "logits/chosen": -2.37510085105896, + "logits/rejected": -2.476245641708374, + "logps/chosen": -218.3855438232422, + "logps/rejected": -196.65638732910156, + "loss": 448.4311, + "rewards/accuracies": 0.3999999761581421, + "rewards/chosen": 49.959930419921875, + "rewards/margins": 1.878343939781189, + "rewards/rejected": 48.08158493041992, + "step": 3930 + }, + { + "epoch": 0.7733071638861629, + "grad_norm": 4130.993241850778, + "learning_rate": 7.428632348317004e-08, + "logits/chosen": -2.62736439704895, + "logits/rejected": -2.5467922687530518, + "logps/chosen": -207.3795928955078, + "logps/rejected": -221.605224609375, + "loss": 447.8396, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 41.028438568115234, + "rewards/margins": 4.151480674743652, + 
"rewards/rejected": 36.876956939697266, + "step": 3940 + }, + { + "epoch": 0.7752698724239451, + "grad_norm": 3780.1009018502455, + "learning_rate": 7.307196269953444e-08, + "logits/chosen": -2.7356576919555664, + "logits/rejected": -2.649294376373291, + "logps/chosen": -247.86917114257812, + "logps/rejected": -226.4387664794922, + "loss": 399.9371, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 35.226959228515625, + "rewards/margins": -1.1159265041351318, + "rewards/rejected": 36.34288787841797, + "step": 3950 + }, + { + "epoch": 0.7772325809617272, + "grad_norm": 4994.717832707031, + "learning_rate": 7.186590837681732e-08, + "logits/chosen": -2.6508007049560547, + "logits/rejected": -2.543595790863037, + "logps/chosen": -225.8889617919922, + "logps/rejected": -170.00404357910156, + "loss": 446.4747, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 36.95011901855469, + "rewards/margins": -0.2716609835624695, + "rewards/rejected": 37.221778869628906, + "step": 3960 + }, + { + "epoch": 0.7791952894995093, + "grad_norm": 5242.481124911569, + "learning_rate": 7.066821713715293e-08, + "logits/chosen": -2.682112455368042, + "logits/rejected": -2.6149463653564453, + "logps/chosen": -276.6802673339844, + "logps/rejected": -244.07644653320312, + "loss": 500.1734, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 38.208160400390625, + "rewards/margins": -3.544785976409912, + "rewards/rejected": 41.75294876098633, + "step": 3970 + }, + { + "epoch": 0.7811579980372915, + "grad_norm": 5223.772181995805, + "learning_rate": 6.947894521004357e-08, + "logits/chosen": -2.6946938037872314, + "logits/rejected": -2.719026565551758, + "logps/chosen": -241.08273315429688, + "logps/rejected": -248.2490692138672, + "loss": 500.5159, + "rewards/accuracies": 0.4999999403953552, + "rewards/chosen": 40.993896484375, + "rewards/margins": -2.9638912677764893, + "rewards/rejected": 43.95779037475586, + "step": 3980 + }, + { + "epoch": 0.7831207065750736, + "grad_norm": 4163.850774595611, + "learning_rate": 6.829814842971965e-08, + "logits/chosen": -2.660637617111206, + "logits/rejected": -2.658723831176758, + "logps/chosen": -191.04733276367188, + "logps/rejected": -218.947998046875, + "loss": 470.5708, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 35.07811737060547, + "rewards/margins": 6.308375358581543, + "rewards/rejected": 28.769744873046875, + "step": 3990 + }, + { + "epoch": 0.7850834151128557, + "grad_norm": 5725.875365218493, + "learning_rate": 6.712588223251809e-08, + "logits/chosen": -2.7059569358825684, + "logits/rejected": -2.6640262603759766, + "logps/chosen": -298.4080505371094, + "logps/rejected": -242.0406951904297, + "loss": 471.646, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": 42.22529220581055, + "rewards/margins": 4.421542167663574, + "rewards/rejected": 37.80375289916992, + "step": 4000 + }, + { + "epoch": 0.7870461236506379, + "grad_norm": 5064.624511268707, + "learning_rate": 6.596220165428002e-08, + "logits/chosen": -2.5640180110931396, + "logits/rejected": -2.560215711593628, + "logps/chosen": -211.7909393310547, + "logps/rejected": -205.9022674560547, + "loss": 443.7035, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": 37.227325439453125, + "rewards/margins": -4.98318338394165, + "rewards/rejected": 42.210506439208984, + "step": 4010 + }, + { + "epoch": 0.78900883218842, + "grad_norm": 3850.4450567632475, + "learning_rate": 6.48071613277669e-08, + "logits/chosen": -2.599118709564209, + 
"logits/rejected": -2.493950366973877, + "logps/chosen": -205.70864868164062, + "logps/rejected": -222.24453735351562, + "loss": 469.0605, + "rewards/accuracies": 0.36666667461395264, + "rewards/chosen": 34.287513732910156, + "rewards/margins": -11.613555908203125, + "rewards/rejected": 45.90106964111328, + "step": 4020 + }, + { + "epoch": 0.7909715407262021, + "grad_norm": 4806.151283034705, + "learning_rate": 6.366081548009553e-08, + "logits/chosen": -2.5998551845550537, + "logits/rejected": -2.6141610145568848, + "logps/chosen": -228.11703491210938, + "logps/rejected": -225.1660614013672, + "loss": 489.8256, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 38.82665252685547, + "rewards/margins": 2.988865852355957, + "rewards/rejected": 35.83778762817383, + "step": 4030 + }, + { + "epoch": 0.7929342492639843, + "grad_norm": 4670.543637279515, + "learning_rate": 6.252321793019192e-08, + "logits/chosen": -2.5964105129241943, + "logits/rejected": -2.6201224327087402, + "logps/chosen": -196.1589813232422, + "logps/rejected": -220.24972534179688, + "loss": 429.3503, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 35.577186584472656, + "rewards/margins": 8.084705352783203, + "rewards/rejected": 27.492481231689453, + "step": 4040 + }, + { + "epoch": 0.7948969578017664, + "grad_norm": 3538.7537563169826, + "learning_rate": 6.139442208626517e-08, + "logits/chosen": -2.5182323455810547, + "logits/rejected": -2.5223388671875, + "logps/chosen": -161.74742126464844, + "logps/rejected": -129.5215301513672, + "loss": 416.8181, + "rewards/accuracies": 0.5, + "rewards/chosen": 30.123971939086914, + "rewards/margins": -0.24544867873191833, + "rewards/rejected": 30.369421005249023, + "step": 4050 + }, + { + "epoch": 0.7968596663395485, + "grad_norm": 4220.376873142191, + "learning_rate": 6.027448094329963e-08, + "logits/chosen": -2.7116899490356445, + "logits/rejected": -2.700085163116455, + "logps/chosen": -206.6305694580078, + "logps/rejected": -232.31869506835938, + "loss": 439.5108, + "rewards/accuracies": 0.36666664481163025, + "rewards/chosen": 36.45801544189453, + "rewards/margins": -3.6942970752716064, + "rewards/rejected": 40.15230941772461, + "step": 4060 + }, + { + "epoch": 0.7988223748773308, + "grad_norm": 3914.973610657506, + "learning_rate": 5.916344708056681e-08, + "logits/chosen": -2.5923945903778076, + "logits/rejected": -2.6218533515930176, + "logps/chosen": -227.88473510742188, + "logps/rejected": -195.7071075439453, + "loss": 429.5933, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 41.29386520385742, + "rewards/margins": 4.6230788230896, + "rewards/rejected": 36.6707878112793, + "step": 4070 + }, + { + "epoch": 0.8007850834151129, + "grad_norm": 3745.173886850546, + "learning_rate": 5.8061372659157306e-08, + "logits/chosen": -2.581653118133545, + "logits/rejected": -2.5396599769592285, + "logps/chosen": -282.02325439453125, + "logps/rejected": -270.64324951171875, + "loss": 466.2438, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 68.39329528808594, + "rewards/margins": 10.069668769836426, + "rewards/rejected": 58.3236198425293, + "step": 4080 + }, + { + "epoch": 0.802747791952895, + "grad_norm": 4574.038017807836, + "learning_rate": 5.6968309419531376e-08, + "logits/chosen": -2.616150379180908, + "logits/rejected": -2.662601947784424, + "logps/chosen": -250.6794891357422, + "logps/rejected": -214.62893676757812, + "loss": 481.5961, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 38.4940071105957, + 
"rewards/margins": 1.1990143060684204, + "rewards/rejected": 37.29499053955078, + "step": 4090 + }, + { + "epoch": 0.8047105004906772, + "grad_norm": 4328.731519222315, + "learning_rate": 5.5884308679090525e-08, + "logits/chosen": -2.608259439468384, + "logits/rejected": -2.3745930194854736, + "logps/chosen": -205.98544311523438, + "logps/rejected": -161.15406799316406, + "loss": 461.3298, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 41.09679412841797, + "rewards/margins": 18.13766860961914, + "rewards/rejected": 22.95912742614746, + "step": 4100 + }, + { + "epoch": 0.8066732090284593, + "grad_norm": 4289.949289633817, + "learning_rate": 5.480942132976732e-08, + "logits/chosen": -2.6755638122558594, + "logits/rejected": -2.5002448558807373, + "logps/chosen": -275.1580505371094, + "logps/rejected": -160.10308837890625, + "loss": 446.8364, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 46.448463439941406, + "rewards/margins": 12.812484741210938, + "rewards/rejected": 33.635982513427734, + "step": 4110 + }, + { + "epoch": 0.8086359175662414, + "grad_norm": 4999.739123013394, + "learning_rate": 5.374369783563698e-08, + "logits/chosen": -2.601970911026001, + "logits/rejected": -2.5141968727111816, + "logps/chosen": -239.7788848876953, + "logps/rejected": -238.88851928710938, + "loss": 508.9154, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 39.377445220947266, + "rewards/margins": -1.0681085586547852, + "rewards/rejected": 40.44554901123047, + "step": 4120 + }, + { + "epoch": 0.8105986261040236, + "grad_norm": 4353.603961100224, + "learning_rate": 5.268718823054752e-08, + "logits/chosen": -2.607865810394287, + "logits/rejected": -2.518691062927246, + "logps/chosen": -210.3962860107422, + "logps/rejected": -214.9388427734375, + "loss": 448.6874, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 35.81767654418945, + "rewards/margins": 3.1919147968292236, + "rewards/rejected": 32.625755310058594, + "step": 4130 + }, + { + "epoch": 0.8125613346418057, + "grad_norm": 4615.118197694769, + "learning_rate": 5.1639942115771384e-08, + "logits/chosen": -2.5191102027893066, + "logits/rejected": -2.606433629989624, + "logps/chosen": -189.73458862304688, + "logps/rejected": -178.82321166992188, + "loss": 403.7612, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 43.21727752685547, + "rewards/margins": 4.633804798126221, + "rewards/rejected": 38.58347702026367, + "step": 4140 + }, + { + "epoch": 0.8145240431795878, + "grad_norm": 4841.387032958852, + "learning_rate": 5.060200865767605e-08, + "logits/chosen": -2.660099506378174, + "logits/rejected": -2.5387344360351562, + "logps/chosen": -327.4266357421875, + "logps/rejected": -261.6529541015625, + "loss": 442.6245, + "rewards/accuracies": 0.3333333134651184, + "rewards/chosen": 46.238677978515625, + "rewards/margins": 4.456854343414307, + "rewards/rejected": 41.781822204589844, + "step": 4150 + }, + { + "epoch": 0.81648675171737, + "grad_norm": 4311.46735416585, + "learning_rate": 4.957343658541632e-08, + "logits/chosen": -2.596989154815674, + "logits/rejected": -2.6006436347961426, + "logps/chosen": -197.25576782226562, + "logps/rejected": -235.4192657470703, + "loss": 460.46, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 34.012142181396484, + "rewards/margins": -0.21886949241161346, + "rewards/rejected": 34.23101043701172, + "step": 4160 + }, + { + "epoch": 0.8184494602551521, + "grad_norm": 4273.784732786409, + "learning_rate": 4.8554274188646215e-08, + 
"logits/chosen": -2.6031975746154785, + "logits/rejected": -2.52292537689209, + "logps/chosen": -231.68310546875, + "logps/rejected": -182.66123962402344, + "loss": 435.8417, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 28.487682342529297, + "rewards/margins": 0.8157535791397095, + "rewards/rejected": 27.67193031311035, + "step": 4170 + }, + { + "epoch": 0.8204121687929342, + "grad_norm": 4626.558302395106, + "learning_rate": 4.754456931525208e-08, + "logits/chosen": -2.4478862285614014, + "logits/rejected": -2.496283769607544, + "logps/chosen": -223.149658203125, + "logps/rejected": -206.23141479492188, + "loss": 458.4399, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 44.41162109375, + "rewards/margins": -19.236581802368164, + "rewards/rejected": 63.64820098876953, + "step": 4180 + }, + { + "epoch": 0.8223748773307163, + "grad_norm": 3431.928908752168, + "learning_rate": 4.654436936910622e-08, + "logits/chosen": -2.6450271606445312, + "logits/rejected": -2.577511787414551, + "logps/chosen": -263.02874755859375, + "logps/rejected": -201.28765869140625, + "loss": 424.2093, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": 41.35602951049805, + "rewards/margins": -2.2303383350372314, + "rewards/rejected": 43.58637237548828, + "step": 4190 + }, + { + "epoch": 0.8243375858684985, + "grad_norm": 5915.329560813934, + "learning_rate": 4.555372130784102e-08, + "logits/chosen": -2.689040184020996, + "logits/rejected": -2.6475412845611572, + "logps/chosen": -340.04925537109375, + "logps/rejected": -252.8019561767578, + "loss": 533.694, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 40.53813552856445, + "rewards/margins": -6.165125846862793, + "rewards/rejected": 46.7032585144043, + "step": 4200 + }, + { + "epoch": 0.8263002944062807, + "grad_norm": 4737.886824886873, + "learning_rate": 4.45726716406449e-08, + "logits/chosen": -2.697927474975586, + "logits/rejected": -2.719137668609619, + "logps/chosen": -273.2940368652344, + "logps/rejected": -216.5937957763672, + "loss": 451.9255, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 35.69445037841797, + "rewards/margins": -6.094300270080566, + "rewards/rejected": 41.78874588012695, + "step": 4210 + }, + { + "epoch": 0.8282630029440629, + "grad_norm": 4647.421613466152, + "learning_rate": 4.360126642607842e-08, + "logits/chosen": -2.5642268657684326, + "logits/rejected": -2.4418981075286865, + "logps/chosen": -289.02105712890625, + "logps/rejected": -204.9908905029297, + "loss": 440.6149, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": 41.383323669433594, + "rewards/margins": 8.476794242858887, + "rewards/rejected": 32.906532287597656, + "step": 4220 + }, + { + "epoch": 0.830225711481845, + "grad_norm": 4716.893754956261, + "learning_rate": 4.2639551269912034e-08, + "logits/chosen": -2.5265350341796875, + "logits/rejected": -2.473881721496582, + "logps/chosen": -175.4093780517578, + "logps/rejected": -160.4100341796875, + "loss": 429.2075, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 33.56465530395508, + "rewards/margins": 2.8609957695007324, + "rewards/rejected": 30.703664779663086, + "step": 4230 + }, + { + "epoch": 0.8321884200196271, + "grad_norm": 4521.242810672261, + "learning_rate": 4.168757132298478e-08, + "logits/chosen": -2.666539192199707, + "logits/rejected": -2.5834553241729736, + "logps/chosen": -215.19424438476562, + "logps/rejected": -227.21981811523438, + "loss": 514.1854, + "rewards/accuracies": 0.5666667222976685, + 
"rewards/chosen": 43.63666915893555, + "rewards/margins": -5.732529640197754, + "rewards/rejected": 49.36920166015625, + "step": 4240 + }, + { + "epoch": 0.8341511285574092, + "grad_norm": 4505.593260548116, + "learning_rate": 4.0745371279084976e-08, + "logits/chosen": -2.647498607635498, + "logits/rejected": -2.5911407470703125, + "logps/chosen": -212.0909423828125, + "logps/rejected": -184.5307159423828, + "loss": 444.7489, + "rewards/accuracies": 0.5, + "rewards/chosen": 36.252803802490234, + "rewards/margins": -3.5416641235351562, + "rewards/rejected": 39.794471740722656, + "step": 4250 + }, + { + "epoch": 0.8361138370951914, + "grad_norm": 3860.498177175377, + "learning_rate": 3.9812995372851544e-08, + "logits/chosen": -2.5739986896514893, + "logits/rejected": -2.562840700149536, + "logps/chosen": -208.7525177001953, + "logps/rejected": -183.97872924804688, + "loss": 448.1645, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 38.437034606933594, + "rewards/margins": -2.179542303085327, + "rewards/rejected": 40.616580963134766, + "step": 4260 + }, + { + "epoch": 0.8380765456329735, + "grad_norm": 4504.214351649246, + "learning_rate": 3.8890487377697265e-08, + "logits/chosen": -2.665849208831787, + "logits/rejected": -2.6961982250213623, + "logps/chosen": -215.90518188476562, + "logps/rejected": -201.3123321533203, + "loss": 442.2289, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 38.20843505859375, + "rewards/margins": 1.6408487558364868, + "rewards/rejected": 36.567588806152344, + "step": 4270 + }, + { + "epoch": 0.8400392541707556, + "grad_norm": 5880.925064454791, + "learning_rate": 3.7977890603754e-08, + "logits/chosen": -2.633975028991699, + "logits/rejected": -2.5609302520751953, + "logps/chosen": -299.1169128417969, + "logps/rejected": -262.66351318359375, + "loss": 456.942, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 49.8904914855957, + "rewards/margins": -0.6626808047294617, + "rewards/rejected": 50.55316925048828, + "step": 4280 + }, + { + "epoch": 0.8420019627085378, + "grad_norm": 4398.542164694948, + "learning_rate": 3.707524789583891e-08, + "logits/chosen": -2.6659607887268066, + "logits/rejected": -2.5334270000457764, + "logps/chosen": -270.0296325683594, + "logps/rejected": -269.8255310058594, + "loss": 478.531, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": 43.77894973754883, + "rewards/margins": 4.812531471252441, + "rewards/rejected": 38.96641540527344, + "step": 4290 + }, + { + "epoch": 0.8439646712463199, + "grad_norm": 4267.277513897122, + "learning_rate": 3.6182601631443596e-08, + "logits/chosen": -2.6800174713134766, + "logits/rejected": -2.6633589267730713, + "logps/chosen": -287.9276123046875, + "logps/rejected": -243.28076171875, + "loss": 483.4367, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": 35.420494079589844, + "rewards/margins": 11.147326469421387, + "rewards/rejected": 24.27316665649414, + "step": 4300 + }, + { + "epoch": 0.845927379784102, + "grad_norm": 4431.88895323535, + "learning_rate": 3.529999371874381e-08, + "logits/chosen": -2.5969769954681396, + "logits/rejected": -2.5301458835601807, + "logps/chosen": -252.8112030029297, + "logps/rejected": -220.48355102539062, + "loss": 455.2943, + "rewards/accuracies": 0.40000003576278687, + "rewards/chosen": 28.724105834960938, + "rewards/margins": -13.333305358886719, + "rewards/rejected": 42.057411193847656, + "step": 4310 + }, + { + "epoch": 0.8478900883218842, + "grad_norm": 4783.508213649116, + "learning_rate": 
3.4427465594632555e-08, + "logits/chosen": -2.436122179031372, + "logits/rejected": -2.3911919593811035, + "logps/chosen": -155.9526824951172, + "logps/rejected": -138.89137268066406, + "loss": 431.1864, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 44.76438903808594, + "rewards/margins": 5.638034820556641, + "rewards/rejected": 39.12635040283203, + "step": 4320 + }, + { + "epoch": 0.8498527968596663, + "grad_norm": 4884.956243665607, + "learning_rate": 3.356505822277417e-08, + "logits/chosen": -2.6550347805023193, + "logits/rejected": -2.5887913703918457, + "logps/chosen": -234.34707641601562, + "logps/rejected": -222.6831512451172, + "loss": 460.6974, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 35.740257263183594, + "rewards/margins": -4.165590763092041, + "rewards/rejected": 39.905845642089844, + "step": 4330 + }, + { + "epoch": 0.8518155053974484, + "grad_norm": 4860.935405417142, + "learning_rate": 3.271281209168186e-08, + "logits/chosen": -2.7008445262908936, + "logits/rejected": -2.528564929962158, + "logps/chosen": -237.4414825439453, + "logps/rejected": -188.55038452148438, + "loss": 401.3473, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 38.89970779418945, + "rewards/margins": 11.053295135498047, + "rewards/rejected": 27.846410751342773, + "step": 4340 + }, + { + "epoch": 0.8537782139352306, + "grad_norm": 6055.100620430209, + "learning_rate": 3.187076721281595e-08, + "logits/chosen": -2.66092848777771, + "logits/rejected": -2.5830471515655518, + "logps/chosen": -210.565673828125, + "logps/rejected": -207.7209930419922, + "loss": 435.1621, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 35.088157653808594, + "rewards/margins": 3.6022579669952393, + "rewards/rejected": 31.485897064208984, + "step": 4350 + }, + { + "epoch": 0.8557409224730128, + "grad_norm": 5425.634496450765, + "learning_rate": 3.1038963118706244e-08, + "logits/chosen": -2.4575374126434326, + "logits/rejected": -2.410790205001831, + "logps/chosen": -230.1165313720703, + "logps/rejected": -192.03001403808594, + "loss": 463.0259, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 47.27201461791992, + "rewards/margins": 6.697729587554932, + "rewards/rejected": 40.57428741455078, + "step": 4360 + }, + { + "epoch": 0.8577036310107949, + "grad_norm": 4776.448875246613, + "learning_rate": 3.0217438861095315e-08, + "logits/chosen": -2.524313449859619, + "logits/rejected": -2.5867514610290527, + "logps/chosen": -174.5723876953125, + "logps/rejected": -189.49002075195312, + "loss": 437.6378, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 31.9951114654541, + "rewards/margins": -0.5893153548240662, + "rewards/rejected": 32.58442687988281, + "step": 4370 + }, + { + "epoch": 0.8596663395485771, + "grad_norm": 4275.684848556367, + "learning_rate": 2.940623300910572e-08, + "logits/chosen": -2.7988975048065186, + "logits/rejected": -2.5053915977478027, + "logps/chosen": -271.8748474121094, + "logps/rejected": -174.35372924804688, + "loss": 423.6515, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": 47.99496841430664, + "rewards/margins": 20.64928436279297, + "rewards/rejected": 27.34568214416504, + "step": 4380 + }, + { + "epoch": 0.8616290480863592, + "grad_norm": 5322.318809597932, + "learning_rate": 2.860538364742898e-08, + "logits/chosen": -2.5552399158477783, + "logits/rejected": -2.506809711456299, + "logps/chosen": -298.73236083984375, + "logps/rejected": -188.0784454345703, + "loss": 482.5715, + 
"rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 71.10126495361328, + "rewards/margins": 38.96298599243164, + "rewards/rejected": 32.138275146484375, + "step": 4390 + }, + { + "epoch": 0.8635917566241413, + "grad_norm": 4149.509052971397, + "learning_rate": 2.7814928374537334e-08, + "logits/chosen": -2.7223961353302, + "logits/rejected": -2.724046230316162, + "logps/chosen": -195.20144653320312, + "logps/rejected": -162.70327758789062, + "loss": 380.9954, + "rewards/accuracies": 0.3999999761581421, + "rewards/chosen": 30.957189559936523, + "rewards/margins": -9.33814811706543, + "rewards/rejected": 40.295345306396484, + "step": 4400 + }, + { + "epoch": 0.8655544651619235, + "grad_norm": 3713.7789351124247, + "learning_rate": 2.7034904300918982e-08, + "logits/chosen": -2.5335371494293213, + "logits/rejected": -2.6188912391662598, + "logps/chosen": -188.16238403320312, + "logps/rejected": -225.91171264648438, + "loss": 480.6724, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 33.5013313293457, + "rewards/margins": -3.412436008453369, + "rewards/rejected": 36.91376495361328, + "step": 4410 + }, + { + "epoch": 0.8675171736997056, + "grad_norm": 4032.6993198961527, + "learning_rate": 2.62653480473356e-08, + "logits/chosen": -2.7851080894470215, + "logits/rejected": -2.6982483863830566, + "logps/chosen": -227.9115753173828, + "logps/rejected": -208.1966094970703, + "loss": 455.1615, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 36.069908142089844, + "rewards/margins": -3.239365339279175, + "rewards/rejected": 39.30927276611328, + "step": 4420 + }, + { + "epoch": 0.8694798822374877, + "grad_norm": 4746.624165191648, + "learning_rate": 2.550629574310309e-08, + "logits/chosen": -2.5399880409240723, + "logits/rejected": -2.567885160446167, + "logps/chosen": -208.8949432373047, + "logps/rejected": -237.2500762939453, + "loss": 469.7333, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 30.57306480407715, + "rewards/margins": -15.53956413269043, + "rewards/rejected": 46.11262893676758, + "step": 4430 + }, + { + "epoch": 0.8714425907752699, + "grad_norm": 5491.387596164183, + "learning_rate": 2.475778302439524e-08, + "logits/chosen": -2.7679731845855713, + "logits/rejected": -2.642608404159546, + "logps/chosen": -276.5832824707031, + "logps/rejected": -206.662353515625, + "loss": 488.9821, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 40.070716857910156, + "rewards/margins": -2.739978790283203, + "rewards/rejected": 42.81069564819336, + "step": 4440 + }, + { + "epoch": 0.873405299313052, + "grad_norm": 4944.209462929502, + "learning_rate": 2.4019845032570875e-08, + "logits/chosen": -2.6773393154144287, + "logits/rejected": -2.6586387157440186, + "logps/chosen": -229.5032196044922, + "logps/rejected": -239.168701171875, + "loss": 458.2089, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": 42.63987350463867, + "rewards/margins": 4.401918411254883, + "rewards/rejected": 38.237953186035156, + "step": 4450 + }, + { + "epoch": 0.8753680078508341, + "grad_norm": 3572.7898569854297, + "learning_rate": 2.3292516412524054e-08, + "logits/chosen": -2.7301645278930664, + "logits/rejected": -2.576932907104492, + "logps/chosen": -254.9808807373047, + "logps/rejected": -182.1911163330078, + "loss": 467.2688, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 39.77206802368164, + "rewards/margins": 2.0470340251922607, + "rewards/rejected": 37.725032806396484, + "step": 4460 + }, + { + "epoch": 0.8773307163886163, + 
"grad_norm": 4185.799065349416, + "learning_rate": 2.2575831311057225e-08, + "logits/chosen": -2.636169910430908, + "logits/rejected": -2.468846082687378, + "logps/chosen": -201.06875610351562, + "logps/rejected": -196.6493377685547, + "loss": 425.2623, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 32.289947509765625, + "rewards/margins": -4.733141899108887, + "rewards/rejected": 37.02309036254883, + "step": 4470 + }, + { + "epoch": 0.8792934249263984, + "grad_norm": 4820.737672747037, + "learning_rate": 2.1869823375278483e-08, + "logits/chosen": -2.442539691925049, + "logits/rejected": -2.250866651535034, + "logps/chosen": -166.69845581054688, + "logps/rejected": -153.46478271484375, + "loss": 423.7283, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 46.657470703125, + "rewards/margins": -1.7938625812530518, + "rewards/rejected": 48.451332092285156, + "step": 4480 + }, + { + "epoch": 0.8812561334641805, + "grad_norm": 4332.453219389595, + "learning_rate": 2.1174525751021578e-08, + "logits/chosen": -2.590590000152588, + "logits/rejected": -2.5887067317962646, + "logps/chosen": -225.1390838623047, + "logps/rejected": -217.708740234375, + "loss": 427.2968, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": 46.83483123779297, + "rewards/margins": -6.553465366363525, + "rewards/rejected": 53.38829803466797, + "step": 4490 + }, + { + "epoch": 0.8832188420019627, + "grad_norm": 3521.570327513789, + "learning_rate": 2.0489971081290193e-08, + "logits/chosen": -2.6378543376922607, + "logits/rejected": -2.61759352684021, + "logps/chosen": -259.528564453125, + "logps/rejected": -198.2197723388672, + "loss": 462.5908, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": 38.549556732177734, + "rewards/margins": -3.70062255859375, + "rewards/rejected": 42.25017547607422, + "step": 4500 + }, + { + "epoch": 0.8851815505397449, + "grad_norm": 4182.514142645229, + "learning_rate": 1.9816191504724826e-08, + "logits/chosen": -2.569251298904419, + "logits/rejected": -2.466890811920166, + "logps/chosen": -183.66378784179688, + "logps/rejected": -176.77728271484375, + "loss": 424.732, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 42.58602523803711, + "rewards/margins": 4.075484275817871, + "rewards/rejected": 38.51054382324219, + "step": 4510 + }, + { + "epoch": 0.887144259077527, + "grad_norm": 4074.2996752655677, + "learning_rate": 1.9153218654094498e-08, + "logits/chosen": -2.6386618614196777, + "logits/rejected": -2.592601776123047, + "logps/chosen": -229.7263946533203, + "logps/rejected": -193.13670349121094, + "loss": 447.1824, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 49.375267028808594, + "rewards/margins": 0.2786618173122406, + "rewards/rejected": 49.09660339355469, + "step": 4520 + }, + { + "epoch": 0.8891069676153092, + "grad_norm": 4530.355997455761, + "learning_rate": 1.8501083654811206e-08, + "logits/chosen": -2.5692012310028076, + "logits/rejected": -2.621366262435913, + "logps/chosen": -255.8880615234375, + "logps/rejected": -219.7906494140625, + "loss": 442.176, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 40.39422607421875, + "rewards/margins": -0.9515798687934875, + "rewards/rejected": 41.345802307128906, + "step": 4530 + }, + { + "epoch": 0.8910696761530913, + "grad_norm": 4684.779486705413, + "learning_rate": 1.7859817123469068e-08, + "logits/chosen": -2.453920602798462, + "logits/rejected": -2.5157649517059326, + "logps/chosen": -173.42568969726562, + "logps/rejected": 
-196.0359649658203, + "loss": 420.6044, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 27.611618041992188, + "rewards/margins": -5.926667213439941, + "rewards/rejected": 33.53828430175781, + "step": 4540 + }, + { + "epoch": 0.8930323846908734, + "grad_norm": 4064.0291223800077, + "learning_rate": 1.7229449166406477e-08, + "logits/chosen": -2.6352016925811768, + "logits/rejected": -2.5559489727020264, + "logps/chosen": -285.65545654296875, + "logps/rejected": -221.51974487304688, + "loss": 402.8019, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 44.281219482421875, + "rewards/margins": 3.11841082572937, + "rewards/rejected": 41.162811279296875, + "step": 4550 + }, + { + "epoch": 0.8949950932286556, + "grad_norm": 3899.6042633621073, + "learning_rate": 1.66100093782931e-08, + "logits/chosen": -2.4818520545959473, + "logits/rejected": -2.5240206718444824, + "logps/chosen": -224.8530731201172, + "logps/rejected": -245.0667724609375, + "loss": 452.7596, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 36.947601318359375, + "rewards/margins": -9.113560676574707, + "rewards/rejected": 46.061161041259766, + "step": 4560 + }, + { + "epoch": 0.8969578017664377, + "grad_norm": 4679.961925387276, + "learning_rate": 1.600152684074005e-08, + "logits/chosen": -2.539313793182373, + "logits/rejected": -2.57034969329834, + "logps/chosen": -271.5174560546875, + "logps/rejected": -281.6427307128906, + "loss": 476.7667, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 51.80295944213867, + "rewards/margins": 16.400400161743164, + "rewards/rejected": 35.402565002441406, + "step": 4570 + }, + { + "epoch": 0.8989205103042198, + "grad_norm": 4429.10974153184, + "learning_rate": 1.540403012093483e-08, + "logits/chosen": -2.6423587799072266, + "logits/rejected": -2.542809009552002, + "logps/chosen": -274.2030029296875, + "logps/rejected": -211.15005493164062, + "loss": 437.5022, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 33.952781677246094, + "rewards/margins": 5.86837911605835, + "rewards/rejected": 28.08440589904785, + "step": 4580 + }, + { + "epoch": 0.900883218842002, + "grad_norm": 4900.034915916676, + "learning_rate": 1.4817547270300185e-08, + "logits/chosen": -2.635451316833496, + "logits/rejected": -2.694579601287842, + "logps/chosen": -224.94287109375, + "logps/rejected": -294.5521545410156, + "loss": 489.1139, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 42.881195068359375, + "rewards/margins": -13.004231452941895, + "rewards/rejected": 55.88542556762695, + "step": 4590 + }, + { + "epoch": 0.9028459273797841, + "grad_norm": 4415.368652937512, + "learning_rate": 1.4242105823176837e-08, + "logits/chosen": -2.620007038116455, + "logits/rejected": -2.5084280967712402, + "logps/chosen": -281.3882141113281, + "logps/rejected": -214.0166778564453, + "loss": 431.6098, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 38.51653289794922, + "rewards/margins": -2.2388863563537598, + "rewards/rejected": 40.75541687011719, + "step": 4600 + }, + { + "epoch": 0.9048086359175662, + "grad_norm": 4673.780083482767, + "learning_rate": 1.3677732795531083e-08, + "logits/chosen": -2.5132038593292236, + "logits/rejected": -2.558148145675659, + "logps/chosen": -225.7850341796875, + "logps/rejected": -277.06292724609375, + "loss": 447.9788, + "rewards/accuracies": 0.5, + "rewards/chosen": 38.36613082885742, + "rewards/margins": 1.1070663928985596, + "rewards/rejected": 37.259063720703125, + "step": 4610 + }, + { + 
"epoch": 0.9067713444553483, + "grad_norm": 4751.787329471743, + "learning_rate": 1.3124454683686364e-08, + "logits/chosen": -2.538525104522705, + "logits/rejected": -2.559837818145752, + "logps/chosen": -207.3318634033203, + "logps/rejected": -222.19656372070312, + "loss": 390.7619, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 42.52751922607422, + "rewards/margins": 1.1009085178375244, + "rewards/rejected": 41.426612854003906, + "step": 4620 + }, + { + "epoch": 0.9087340529931305, + "grad_norm": 3330.4433365907444, + "learning_rate": 1.2582297463079288e-08, + "logits/chosen": -2.664224147796631, + "logits/rejected": -2.48795223236084, + "logps/chosen": -185.2596435546875, + "logps/rejected": -117.39766693115234, + "loss": 416.2551, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 43.808128356933594, + "rewards/margins": 6.2645745277404785, + "rewards/rejected": 37.54355239868164, + "step": 4630 + }, + { + "epoch": 0.9106967615309126, + "grad_norm": 4153.6041400108625, + "learning_rate": 1.2051286587040049e-08, + "logits/chosen": -2.583369731903076, + "logits/rejected": -2.554609537124634, + "logps/chosen": -235.30203247070312, + "logps/rejected": -237.036865234375, + "loss": 435.5568, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": 45.061485290527344, + "rewards/margins": 5.400920867919922, + "rewards/rejected": 39.660560607910156, + "step": 4640 + }, + { + "epoch": 0.9126594700686947, + "grad_norm": 4977.865699530452, + "learning_rate": 1.1531446985597604e-08, + "logits/chosen": -2.6574277877807617, + "logits/rejected": -2.655367851257324, + "logps/chosen": -331.62701416015625, + "logps/rejected": -245.23562622070312, + "loss": 503.0266, + "rewards/accuracies": 0.5, + "rewards/chosen": 43.575950622558594, + "rewards/margins": 2.5033040046691895, + "rewards/rejected": 41.07265090942383, + "step": 4650 + }, + { + "epoch": 0.914622178606477, + "grad_norm": 4767.861821528587, + "learning_rate": 1.1022803064309194e-08, + "logits/chosen": -2.5727593898773193, + "logits/rejected": -2.439497470855713, + "logps/chosen": -246.31069946289062, + "logps/rejected": -274.4377746582031, + "loss": 460.7343, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 71.24130249023438, + "rewards/margins": -35.16503143310547, + "rewards/rejected": 106.40632629394531, + "step": 4660 + }, + { + "epoch": 0.9165848871442591, + "grad_norm": 4339.937447004007, + "learning_rate": 1.0525378703114401e-08, + "logits/chosen": -2.7839035987854004, + "logits/rejected": -2.6208622455596924, + "logps/chosen": -171.77706909179688, + "logps/rejected": -163.43350219726562, + "loss": 412.53, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 39.18108367919922, + "rewards/margins": 12.46143627166748, + "rewards/rejected": 26.719646453857422, + "step": 4670 + }, + { + "epoch": 0.9185475956820413, + "grad_norm": 5194.796246618481, + "learning_rate": 1.0039197255214238e-08, + "logits/chosen": -2.656510829925537, + "logits/rejected": -2.70479154586792, + "logps/chosen": -157.28660583496094, + "logps/rejected": -191.18362426757812, + "loss": 462.9357, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 30.98434066772461, + "rewards/margins": -9.275310516357422, + "rewards/rejected": 40.25965118408203, + "step": 4680 + }, + { + "epoch": 0.9205103042198234, + "grad_norm": 4776.008954221483, + "learning_rate": 9.564281545974661e-09, + "logits/chosen": -2.669079303741455, + "logits/rejected": -2.7157466411590576, + "logps/chosen": -225.6437225341797, + 
"logps/rejected": -216.80502319335938, + "loss": 406.8918, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 32.2359504699707, + "rewards/margins": -0.8312788009643555, + "rewards/rejected": 33.067230224609375, + "step": 4690 + }, + { + "epoch": 0.9224730127576055, + "grad_norm": 4908.2657214789815, + "learning_rate": 9.100653871854963e-09, + "logits/chosen": -2.679262399673462, + "logits/rejected": -2.778007984161377, + "logps/chosen": -262.40435791015625, + "logps/rejected": -246.1509246826172, + "loss": 441.9648, + "rewards/accuracies": 0.3999999761581421, + "rewards/chosen": 33.58713150024414, + "rewards/margins": -9.374418258666992, + "rewards/rejected": 42.961551666259766, + "step": 4700 + }, + { + "epoch": 0.9244357212953876, + "grad_norm": 4390.917872822957, + "learning_rate": 8.648335999360934e-09, + "logits/chosen": -2.6005899906158447, + "logits/rejected": -2.47259783744812, + "logps/chosen": -230.7547607421875, + "logps/rejected": -167.08209228515625, + "loss": 487.7689, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 30.501667022705078, + "rewards/margins": -12.195757865905762, + "rewards/rejected": 42.697425842285156, + "step": 4710 + }, + { + "epoch": 0.9263984298331698, + "grad_norm": 4194.9838878661885, + "learning_rate": 8.207349164023047e-09, + "logits/chosen": -2.4237990379333496, + "logits/rejected": -2.501603126525879, + "logps/chosen": -227.91152954101562, + "logps/rejected": -213.74789428710938, + "loss": 416.369, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 36.72782897949219, + "rewards/margins": -10.737937927246094, + "rewards/rejected": 47.46576690673828, + "step": 4720 + }, + { + "epoch": 0.9283611383709519, + "grad_norm": 4906.187260216281, + "learning_rate": 7.777714069399532e-09, + "logits/chosen": -2.566519260406494, + "logits/rejected": -2.4378700256347656, + "logps/chosen": -234.2556610107422, + "logps/rejected": -213.920166015625, + "loss": 469.0785, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": 39.02256393432617, + "rewards/margins": -1.7270667552947998, + "rewards/rejected": 40.74962615966797, + "step": 4730 + }, + { + "epoch": 0.930323846908734, + "grad_norm": 4787.2292795122075, + "learning_rate": 7.359450886104263e-09, + "logits/chosen": -2.572117567062378, + "logits/rejected": -2.478353500366211, + "logps/chosen": -258.222900390625, + "logps/rejected": -196.6918487548828, + "loss": 399.8398, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 39.50740432739258, + "rewards/margins": -6.970947265625, + "rewards/rejected": 46.47834777832031, + "step": 4740 + }, + { + "epoch": 0.9322865554465162, + "grad_norm": 4652.016685552805, + "learning_rate": 6.9525792508597634e-09, + "logits/chosen": -2.7576687335968018, + "logits/rejected": -2.7529947757720947, + "logps/chosen": -238.3095703125, + "logps/rejected": -252.69491577148438, + "loss": 448.3941, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 50.65209197998047, + "rewards/margins": 16.916269302368164, + "rewards/rejected": 33.73582077026367, + "step": 4750 + }, + { + "epoch": 0.9342492639842983, + "grad_norm": 4341.304306349906, + "learning_rate": 6.557118265575451e-09, + "logits/chosen": -2.5280609130859375, + "logits/rejected": -2.5678343772888184, + "logps/chosen": -267.6020812988281, + "logps/rejected": -232.5404510498047, + "loss": 430.607, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 37.56711959838867, + "rewards/margins": -5.861638069152832, + "rewards/rejected": 43.42876052856445, 
+ "step": 4760 + }, + { + "epoch": 0.9362119725220804, + "grad_norm": 5059.8930667799805, + "learning_rate": 6.1730864964507636e-09, + "logits/chosen": -2.683964252471924, + "logits/rejected": -2.538910388946533, + "logps/chosen": -254.32638549804688, + "logps/rejected": -191.3366241455078, + "loss": 443.235, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 42.44365692138672, + "rewards/margins": -5.388759613037109, + "rewards/rejected": 47.83241653442383, + "step": 4770 + }, + { + "epoch": 0.9381746810598626, + "grad_norm": 4955.471916279354, + "learning_rate": 5.8005019731033615e-09, + "logits/chosen": -2.6290130615234375, + "logits/rejected": -2.549903392791748, + "logps/chosen": -240.4611053466797, + "logps/rejected": -189.25515747070312, + "loss": 433.3238, + "rewards/accuracies": 0.5, + "rewards/chosen": 39.42809295654297, + "rewards/margins": 5.345437049865723, + "rewards/rejected": 34.08266067504883, + "step": 4780 + }, + { + "epoch": 0.9401373895976447, + "grad_norm": 4626.054506926014, + "learning_rate": 5.439382187722968e-09, + "logits/chosen": -2.818596124649048, + "logits/rejected": -2.6862807273864746, + "logps/chosen": -343.7168273925781, + "logps/rejected": -248.85122680664062, + "loss": 455.7912, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 39.650245666503906, + "rewards/margins": 2.1781814098358154, + "rewards/rejected": 37.472068786621094, + "step": 4790 + }, + { + "epoch": 0.9421000981354269, + "grad_norm": 4852.35621544925, + "learning_rate": 5.089744094249837e-09, + "logits/chosen": -2.8111395835876465, + "logits/rejected": -2.5496573448181152, + "logps/chosen": -319.8478088378906, + "logps/rejected": -225.66421508789062, + "loss": 448.5634, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 50.069862365722656, + "rewards/margins": 11.557438850402832, + "rewards/rejected": 38.512420654296875, + "step": 4800 + }, + { + "epoch": 0.9440628066732091, + "grad_norm": 4371.655540404004, + "learning_rate": 4.751604107579077e-09, + "logits/chosen": -2.7233150005340576, + "logits/rejected": -2.6152617931365967, + "logps/chosen": -232.55117797851562, + "logps/rejected": -210.745361328125, + "loss": 452.9889, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 51.05016326904297, + "rewards/margins": 15.190671920776367, + "rewards/rejected": 35.85948944091797, + "step": 4810 + }, + { + "epoch": 0.9460255152109912, + "grad_norm": 5385.241820900966, + "learning_rate": 4.424978102789661e-09, + "logits/chosen": -2.448641300201416, + "logits/rejected": -2.389498233795166, + "logps/chosen": -313.8994445800781, + "logps/rejected": -198.6295623779297, + "loss": 497.7832, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 92.62660217285156, + "rewards/margins": 39.63528060913086, + "rewards/rejected": 52.99132537841797, + "step": 4820 + }, + { + "epoch": 0.9479882237487733, + "grad_norm": 4912.7958917731285, + "learning_rate": 4.109881414399524e-09, + "logits/chosen": -2.6708881855010986, + "logits/rejected": -2.621290683746338, + "logps/chosen": -241.0809783935547, + "logps/rejected": -248.0062255859375, + "loss": 516.8406, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 37.55589294433594, + "rewards/margins": 4.549037933349609, + "rewards/rejected": 33.00685501098633, + "step": 4830 + }, + { + "epoch": 0.9499509322865555, + "grad_norm": 5368.909774149902, + "learning_rate": 3.806328835645272e-09, + "logits/chosen": -2.4885942935943604, + "logits/rejected": -2.4917941093444824, + "logps/chosen": 
-208.9341278076172, + "logps/rejected": -193.43490600585938, + "loss": 475.2958, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 32.45588302612305, + "rewards/margins": -3.4110305309295654, + "rewards/rejected": 35.866912841796875, + "step": 4840 + }, + { + "epoch": 0.9519136408243376, + "grad_norm": 4661.5821603670565, + "learning_rate": 3.5143346177878565e-09, + "logits/chosen": -2.6988439559936523, + "logits/rejected": -2.6492531299591064, + "logps/chosen": -327.31988525390625, + "logps/rejected": -232.4467315673828, + "loss": 494.2165, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 41.69511413574219, + "rewards/margins": 4.302943229675293, + "rewards/rejected": 37.39216995239258, + "step": 4850 + }, + { + "epoch": 0.9538763493621197, + "grad_norm": 4980.677871646419, + "learning_rate": 3.233912469443545e-09, + "logits/chosen": -2.5953526496887207, + "logits/rejected": -2.442674160003662, + "logps/chosen": -279.9731750488281, + "logps/rejected": -181.50048828125, + "loss": 487.4147, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 47.79022979736328, + "rewards/margins": 9.6940336227417, + "rewards/rejected": 38.09619903564453, + "step": 4860 + }, + { + "epoch": 0.9558390578999019, + "grad_norm": 4664.751420879302, + "learning_rate": 2.9650755559401388e-09, + "logits/chosen": -2.470555543899536, + "logits/rejected": -2.4070048332214355, + "logps/chosen": -276.84552001953125, + "logps/rejected": -245.06423950195312, + "loss": 445.485, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 70.07025909423828, + "rewards/margins": 1.3178901672363281, + "rewards/rejected": 68.75236511230469, + "step": 4870 + }, + { + "epoch": 0.957801766437684, + "grad_norm": 4302.0676887743775, + "learning_rate": 2.7078364986990175e-09, + "logits/chosen": -2.494755268096924, + "logits/rejected": -2.3925018310546875, + "logps/chosen": -353.33343505859375, + "logps/rejected": -268.85394287109375, + "loss": 474.6627, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 42.405967712402344, + "rewards/margins": 1.3862206935882568, + "rewards/rejected": 41.01974105834961, + "step": 4880 + }, + { + "epoch": 0.9597644749754661, + "grad_norm": 3888.461730316749, + "learning_rate": 2.4622073746426165e-09, + "logits/chosen": -2.5994725227355957, + "logits/rejected": -2.5952556133270264, + "logps/chosen": -236.3902587890625, + "logps/rejected": -188.7119598388672, + "loss": 450.1432, + "rewards/accuracies": 0.5, + "rewards/chosen": 37.518062591552734, + "rewards/margins": 1.0947520732879639, + "rewards/rejected": 36.42330551147461, + "step": 4890 + }, + { + "epoch": 0.9617271835132483, + "grad_norm": 4381.720003872811, + "learning_rate": 2.2281997156273213e-09, + "logits/chosen": -2.6201536655426025, + "logits/rejected": -2.5877933502197266, + "logps/chosen": -266.6863708496094, + "logps/rejected": -211.7058563232422, + "loss": 494.6588, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 45.659767150878906, + "rewards/margins": -1.3253047466278076, + "rewards/rejected": 46.985069274902344, + "step": 4900 + }, + { + "epoch": 0.9636898920510304, + "grad_norm": 3346.892913249055, + "learning_rate": 2.0058245079021265e-09, + "logits/chosen": -2.630370616912842, + "logits/rejected": -2.5561165809631348, + "logps/chosen": -220.84243774414062, + "logps/rejected": -161.4556121826172, + "loss": 440.1361, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": 38.90265655517578, + "rewards/margins": -1.657293677330017, + "rewards/rejected": 
40.55995178222656, + "step": 4910 + }, + { + "epoch": 0.9656526005888125, + "grad_norm": 5128.958349199915, + "learning_rate": 1.7950921915928784e-09, + "logits/chosen": -2.450956106185913, + "logits/rejected": -2.4342901706695557, + "logps/chosen": -206.73245239257812, + "logps/rejected": -181.1966094970703, + "loss": 418.8285, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": 58.6185417175293, + "rewards/margins": 28.910715103149414, + "rewards/rejected": 29.70783042907715, + "step": 4920 + }, + { + "epoch": 0.9676153091265947, + "grad_norm": 5099.162044042663, + "learning_rate": 1.596012660212087e-09, + "logits/chosen": -2.6458563804626465, + "logits/rejected": -2.554788589477539, + "logps/chosen": -279.41925048828125, + "logps/rejected": -185.72958374023438, + "loss": 513.4474, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 39.27458953857422, + "rewards/margins": 5.483712196350098, + "rewards/rejected": 33.79088592529297, + "step": 4930 + }, + { + "epoch": 0.9695780176643768, + "grad_norm": 4551.279045211256, + "learning_rate": 1.408595260194434e-09, + "logits/chosen": -2.5972819328308105, + "logits/rejected": -2.4921669960021973, + "logps/chosen": -277.7696838378906, + "logps/rejected": -167.14578247070312, + "loss": 414.1692, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 37.349212646484375, + "rewards/margins": 3.4034876823425293, + "rewards/rejected": 33.94572067260742, + "step": 4940 + }, + { + "epoch": 0.971540726202159, + "grad_norm": 3773.680808520389, + "learning_rate": 1.2328487904580131e-09, + "logits/chosen": -2.6180472373962402, + "logits/rejected": -2.4990921020507812, + "logps/chosen": -174.1087646484375, + "logps/rejected": -185.30502319335938, + "loss": 414.0831, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 39.71233367919922, + "rewards/margins": 5.6509785652160645, + "rewards/rejected": 34.06135177612305, + "step": 4950 + }, + { + "epoch": 0.9735034347399412, + "grad_norm": 4881.977965503595, + "learning_rate": 1.0687815019912173e-09, + "logits/chosen": -2.5493404865264893, + "logits/rejected": -2.513451337814331, + "logps/chosen": -231.9395294189453, + "logps/rejected": -286.641845703125, + "loss": 508.2021, + "rewards/accuracies": 0.5, + "rewards/chosen": 48.53483200073242, + "rewards/margins": 3.981222629547119, + "rewards/rejected": 44.553611755371094, + "step": 4960 + }, + { + "epoch": 0.9754661432777233, + "grad_norm": 3823.13046567604, + "learning_rate": 9.164010974653802e-10, + "logits/chosen": -2.585801362991333, + "logits/rejected": -2.5278539657592773, + "logps/chosen": -229.3662109375, + "logps/rejected": -232.8991241455078, + "loss": 405.6757, + "rewards/accuracies": 0.4666666090488434, + "rewards/chosen": 38.5717658996582, + "rewards/margins": -1.0833070278167725, + "rewards/rejected": 39.655067443847656, + "step": 4970 + }, + { + "epoch": 0.9774288518155054, + "grad_norm": 4001.149224700046, + "learning_rate": 7.757147308731504e-10, + "logits/chosen": -2.591248035430908, + "logits/rejected": -2.4500527381896973, + "logps/chosen": -262.47003173828125, + "logps/rejected": -236.8359832763672, + "loss": 429.9303, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 45.33655548095703, + "rewards/margins": 12.93742561340332, + "rewards/rejected": 32.39912796020508, + "step": 4980 + }, + { + "epoch": 0.9793915603532876, + "grad_norm": 4285.35894614754, + "learning_rate": 6.467290071925646e-10, + "logits/chosen": -2.4244279861450195, + "logits/rejected": -2.5194430351257324, + 
"logps/chosen": -187.13661193847656, + "logps/rejected": -179.65377807617188, + "loss": 471.227, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 26.85833168029785, + "rewards/margins": -15.445712089538574, + "rewards/rejected": 42.304039001464844, + "step": 4990 + }, + { + "epoch": 0.9813542688910697, + "grad_norm": 4527.552369700591, + "learning_rate": 5.29449982077046e-10, + "logits/chosen": -2.6375205516815186, + "logits/rejected": -2.586085796356201, + "logps/chosen": -246.67724609375, + "logps/rejected": -176.51097106933594, + "loss": 446.9356, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 37.850624084472656, + "rewards/margins": 0.24829140305519104, + "rewards/rejected": 37.602333068847656, + "step": 5000 + }, + { + "epoch": 0.9833169774288518, + "grad_norm": 5452.2882899497845, + "learning_rate": 4.2388316157104806e-10, + "logits/chosen": -2.5154407024383545, + "logits/rejected": -2.468649387359619, + "logps/chosen": -233.3128662109375, + "logps/rejected": -186.05923461914062, + "loss": 497.3842, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 34.57950973510742, + "rewards/margins": -0.713744044303894, + "rewards/rejected": 35.293251037597656, + "step": 5010 + }, + { + "epoch": 0.985279685966634, + "grad_norm": 4683.8869096540675, + "learning_rate": 3.300335018515676e-10, + "logits/chosen": -2.5799403190612793, + "logits/rejected": -2.5058860778808594, + "logps/chosen": -176.56678771972656, + "logps/rejected": -124.17610168457031, + "loss": 461.8412, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": 37.2112922668457, + "rewards/margins": -3.3934874534606934, + "rewards/rejected": 40.60478210449219, + "step": 5020 + }, + { + "epoch": 0.9872423945044161, + "grad_norm": 4264.237376017354, + "learning_rate": 2.4790540899546907e-10, + "logits/chosen": -2.508977174758911, + "logits/rejected": -2.5530219078063965, + "logps/chosen": -184.57777404785156, + "logps/rejected": -225.44070434570312, + "loss": 435.1646, + "rewards/accuracies": 0.36666667461395264, + "rewards/chosen": 37.472877502441406, + "rewards/margins": -12.762925148010254, + "rewards/rejected": 50.23580551147461, + "step": 5030 + }, + { + "epoch": 0.9892051030421982, + "grad_norm": 3979.2806515385905, + "learning_rate": 1.7750273877262244e-10, + "logits/chosen": -2.5312910079956055, + "logits/rejected": -2.4877536296844482, + "logps/chosen": -250.9333953857422, + "logps/rejected": -216.4891357421875, + "loss": 438.17, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 37.162452697753906, + "rewards/margins": -7.625374794006348, + "rewards/rejected": 44.7878303527832, + "step": 5040 + }, + { + "epoch": 0.9911678115799804, + "grad_norm": 4453.834332740248, + "learning_rate": 1.1882879646485379e-10, + "logits/chosen": -2.4382872581481934, + "logits/rejected": -2.3793227672576904, + "logps/chosen": -180.4657745361328, + "logps/rejected": -177.47760009765625, + "loss": 477.8574, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 44.736907958984375, + "rewards/margins": 6.490880012512207, + "rewards/rejected": 38.24602127075195, + "step": 5050 + }, + { + "epoch": 0.9931305201177625, + "grad_norm": 4887.6266129665655, + "learning_rate": 7.188633671079136e-11, + "logits/chosen": -2.6609623432159424, + "logits/rejected": -2.531956434249878, + "logps/chosen": -239.935546875, + "logps/rejected": -164.88461303710938, + "loss": 430.8216, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": 48.27210235595703, + "rewards/margins": 
3.368035078048706, + "rewards/rejected": 44.9040641784668, + "step": 5060 + }, + { + "epoch": 0.9950932286555446, + "grad_norm": 5751.1482161869735, + "learning_rate": 3.6677563376580344e-11, + "logits/chosen": -2.605489730834961, + "logits/rejected": -2.559293270111084, + "logps/chosen": -228.0939178466797, + "logps/rejected": -279.6558837890625, + "loss": 497.0193, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": 36.726707458496094, + "rewards/margins": 5.8451924324035645, + "rewards/rejected": 30.881515502929688, + "step": 5070 + }, + { + "epoch": 0.9970559371933267, + "grad_norm": 3951.8547590293524, + "learning_rate": 1.3204129452354385e-11, + "logits/chosen": -2.562615394592285, + "logits/rejected": -2.5046939849853516, + "logps/chosen": -234.50192260742188, + "logps/rejected": -236.96835327148438, + "loss": 461.9209, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 36.579288482666016, + "rewards/margins": -11.703116416931152, + "rewards/rejected": 48.282405853271484, + "step": 5080 + }, + { + "epoch": 0.9990186457311089, + "grad_norm": 5081.291469555591, + "learning_rate": 1.467136974631078e-12, + "logits/chosen": -2.6177847385406494, + "logits/rejected": -2.4632296562194824, + "logps/chosen": -227.45236206054688, + "logps/rejected": -167.18699645996094, + "loss": 430.0306, + "rewards/accuracies": 0.46666663885116577, + "rewards/chosen": 44.46977996826172, + "rewards/margins": 9.820869445800781, + "rewards/rejected": 34.64891052246094, + "step": 5090 + }, + { + "epoch": 1.0, + "step": 5095, + "total_flos": 0.0, + "train_loss": 457.40178580242, + "train_runtime": 17501.0679, + "train_samples_per_second": 3.493, + "train_steps_per_second": 0.291 + } + ], + "logging_steps": 10, + "max_steps": 5095, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +}