diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,23554 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 7000, + "global_step": 16770, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 2.981514609421586e-09, + "logits/chosen": -3.0218403339385986, + "logits/rejected": -2.940047025680542, + "logps/chosen": -73.02317810058594, + "logps/rejected": -48.23734664916992, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 2.9815146094215865e-08, + "logits/chosen": -2.9619266986846924, + "logits/rejected": -2.9563283920288086, + "logps/chosen": -62.74790954589844, + "logps/rejected": -44.95586395263672, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00012704107211902738, + "rewards/margins": 0.0005221219034865499, + "rewards/rejected": -0.0006491629173979163, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 5.963029218843173e-08, + "logits/chosen": -2.960465908050537, + "logits/rejected": -2.9539878368377686, + "logps/chosen": -75.64433288574219, + "logps/rejected": -44.491546630859375, + "loss": 0.6932, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.00027207276434637606, + "rewards/margins": -0.00038699532160535455, + "rewards/rejected": 0.00011492250632727519, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 8.94454382826476e-08, + "logits/chosen": -2.9388415813446045, + "logits/rejected": -2.9425132274627686, + "logps/chosen": -69.98499298095703, + "logps/rejected": -44.00871276855469, + "loss": 0.693, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.000582454726099968, + "rewards/margins": 0.00027978423167951405, + "rewards/rejected": 0.0003026704944204539, + "step": 30 + }, + { + "epoch": 0.0, + "learning_rate": 1.1926058437686346e-07, + "logits/chosen": -2.9837682247161865, + "logits/rejected": -2.947817325592041, + "logps/chosen": -69.46333312988281, + "logps/rejected": -42.51030731201172, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.00010345459304517135, + "rewards/margins": 5.823614264954813e-05, + "rewards/rejected": 4.521848677541129e-05, + "step": 40 + }, + { + "epoch": 0.0, + "learning_rate": 1.490757304710793e-07, + "logits/chosen": -2.930995225906372, + "logits/rejected": -2.9113659858703613, + "logps/chosen": -68.5018310546875, + "logps/rejected": -44.30644989013672, + "loss": 0.6932, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0005911254556849599, + "rewards/margins": -2.3979193429113366e-05, + "rewards/rejected": -0.0005671462859027088, + "step": 50 + }, + { + "epoch": 0.0, + "learning_rate": 1.788908765652952e-07, + "logits/chosen": -3.00565505027771, + "logits/rejected": -2.988889217376709, + "logps/chosen": -70.67076110839844, + "logps/rejected": -45.866981506347656, + "loss": 0.6931, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.00041712570236995816, + "rewards/margins": 7.655953231733292e-05, + "rewards/rejected": 0.00034056618460454047, + "step": 60 + }, + { + "epoch": 0.0, + "learning_rate": 2.0870602265951104e-07, + "logits/chosen": -2.985520124435425, + "logits/rejected": -2.947092056274414, + "logps/chosen": -72.18915557861328, + "logps/rejected": -45.6812858581543, + "loss": 0.6927, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 7.899569027358666e-05, + "rewards/margins": 0.0009106778306886554, + "rewards/rejected": -0.0008316821185871959, + "step": 70 + }, + { + "epoch": 0.0, + "learning_rate": 2.385211687537269e-07, + "logits/chosen": -2.9711382389068604, + "logits/rejected": -2.93875789642334, + "logps/chosen": -71.97798156738281, + "logps/rejected": -44.88883972167969, + "loss": 0.6923, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0008017882937565446, + "rewards/margins": 0.0018558672163635492, + "rewards/rejected": -0.0010540790390223265, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 2.6833631484794277e-07, + "logits/chosen": -2.9703755378723145, + "logits/rejected": -2.9743106365203857, + "logps/chosen": -75.0093994140625, + "logps/rejected": -44.32701873779297, + "loss": 0.6921, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.0006831464124843478, + "rewards/margins": 0.0019017761806026101, + "rewards/rejected": -0.0012186297681182623, + "step": 90 + }, + { + "epoch": 0.01, + "learning_rate": 2.981514609421586e-07, + "logits/chosen": -2.9895684719085693, + "logits/rejected": -3.004122734069824, + "logps/chosen": -71.56358337402344, + "logps/rejected": -45.33041000366211, + "loss": 0.6915, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0017809504643082619, + "rewards/margins": 0.003538265125826001, + "rewards/rejected": -0.0017573146615177393, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 3.2796660703637447e-07, + "logits/chosen": -2.963571071624756, + "logits/rejected": -2.928189516067505, + "logps/chosen": -69.69255065917969, + "logps/rejected": -44.475135803222656, + "loss": 0.6909, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0015576332807540894, + "rewards/margins": 0.0045606764033436775, + "rewards/rejected": -0.003003043122589588, + "step": 110 + }, + { + "epoch": 0.01, + "learning_rate": 3.577817531305904e-07, + "logits/chosen": -2.974503993988037, + "logits/rejected": -2.9577651023864746, + "logps/chosen": -67.06887817382812, + "logps/rejected": -45.446311950683594, + "loss": 0.6905, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0023120190016925335, + "rewards/margins": 0.005582844372838736, + "rewards/rejected": -0.003270825371146202, + "step": 120 + }, + { + "epoch": 0.01, + "learning_rate": 3.8759689922480623e-07, + "logits/chosen": -2.995654344558716, + "logits/rejected": -2.973827600479126, + "logps/chosen": -64.65982055664062, + "logps/rejected": -44.023311614990234, + "loss": 0.6897, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0025220434181392193, + "rewards/margins": 0.006847357843071222, + "rewards/rejected": -0.004325315356254578, + "step": 130 + }, + { + "epoch": 0.01, + "learning_rate": 4.174120453190221e-07, + "logits/chosen": -2.967301845550537, + "logits/rejected": -2.9388742446899414, + "logps/chosen": -71.3984603881836, + "logps/rejected": -44.561988830566406, + "loss": 0.6883, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0037331501953303814, + "rewards/margins": 0.009782666340470314, + "rewards/rejected": -0.006049515679478645, + "step": 140 + }, + { + "epoch": 0.01, + "learning_rate": 4.47227191413238e-07, + "logits/chosen": -2.9889566898345947, + "logits/rejected": -2.9502789974212646, + "logps/chosen": -73.5901870727539, + "logps/rejected": -44.54457473754883, + "loss": 0.6871, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005121590569615364, + "rewards/margins": 0.012893019244074821, + "rewards/rejected": -0.007771429605782032, + "step": 150 + }, + { + "epoch": 0.01, + "learning_rate": 4.770423375074538e-07, + "logits/chosen": -2.997464895248413, + "logits/rejected": -2.973012924194336, + "logps/chosen": -67.8456802368164, + "logps/rejected": -44.57398223876953, + "loss": 0.6861, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005220805760473013, + "rewards/margins": 0.014461624436080456, + "rewards/rejected": -0.00924081914126873, + "step": 160 + }, + { + "epoch": 0.01, + "learning_rate": 5.068574836016696e-07, + "logits/chosen": -2.954655885696411, + "logits/rejected": -2.938756227493286, + "logps/chosen": -74.64299011230469, + "logps/rejected": -44.95725631713867, + "loss": 0.6845, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.006698357407003641, + "rewards/margins": 0.0172797292470932, + "rewards/rejected": -0.010581372305750847, + "step": 170 + }, + { + "epoch": 0.01, + "learning_rate": 5.366726296958855e-07, + "logits/chosen": -2.9657092094421387, + "logits/rejected": -2.942619800567627, + "logps/chosen": -72.10955810546875, + "logps/rejected": -45.92462921142578, + "loss": 0.6827, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.007762356661260128, + "rewards/margins": 0.02173658087849617, + "rewards/rejected": -0.013974225148558617, + "step": 180 + }, + { + "epoch": 0.01, + "learning_rate": 5.664877757901014e-07, + "logits/chosen": -3.0228991508483887, + "logits/rejected": -2.971086025238037, + "logps/chosen": -76.38833618164062, + "logps/rejected": -47.514808654785156, + "loss": 0.6804, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.010731477290391922, + "rewards/margins": 0.027403127402067184, + "rewards/rejected": -0.01667165383696556, + "step": 190 + }, + { + "epoch": 0.01, + "learning_rate": 5.963029218843172e-07, + "logits/chosen": -2.9411473274230957, + "logits/rejected": -2.9228625297546387, + "logps/chosen": -70.54524230957031, + "logps/rejected": -45.080833435058594, + "loss": 0.6783, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01168130338191986, + "rewards/margins": 0.03029986284673214, + "rewards/rejected": -0.018618561327457428, + "step": 200 + }, + { + "epoch": 0.01, + "learning_rate": 6.26118067978533e-07, + "logits/chosen": -2.9853718280792236, + "logits/rejected": -2.964855670928955, + "logps/chosen": -65.7616195678711, + "logps/rejected": -45.29457473754883, + "loss": 0.6759, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.012055915780365467, + "rewards/margins": 0.03174392506480217, + "rewards/rejected": -0.019688012078404427, + "step": 210 + }, + { + "epoch": 0.01, + "learning_rate": 6.559332140727489e-07, + "logits/chosen": -2.9891045093536377, + "logits/rejected": -2.9741458892822266, + "logps/chosen": -76.55693054199219, + "logps/rejected": -47.39777755737305, + "loss": 0.6716, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.018879849463701248, + "rewards/margins": 0.045966826379299164, + "rewards/rejected": -0.027086973190307617, + "step": 220 + }, + { + "epoch": 0.01, + "learning_rate": 6.857483601669648e-07, + "logits/chosen": -2.9521539211273193, + "logits/rejected": -2.9273805618286133, + "logps/chosen": -73.84449005126953, + "logps/rejected": -47.50820541381836, + "loss": 0.6706, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.018496818840503693, + "rewards/margins": 0.04870253801345825, + "rewards/rejected": -0.030205722898244858, + "step": 230 + }, + { + "epoch": 0.01, + "learning_rate": 7.155635062611808e-07, + "logits/chosen": -2.990572690963745, + "logits/rejected": -2.996241569519043, + "logps/chosen": -69.92765808105469, + "logps/rejected": -47.41942596435547, + "loss": 0.666, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01791529916226864, + "rewards/margins": 0.055198751389980316, + "rewards/rejected": -0.037283457815647125, + "step": 240 + }, + { + "epoch": 0.01, + "learning_rate": 7.453786523553966e-07, + "logits/chosen": -2.9767189025878906, + "logits/rejected": -2.9706878662109375, + "logps/chosen": -70.4488525390625, + "logps/rejected": -49.38618469238281, + "loss": 0.6655, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01426013559103012, + "rewards/margins": 0.055192362517118454, + "rewards/rejected": -0.04093223437666893, + "step": 250 + }, + { + "epoch": 0.02, + "learning_rate": 7.751937984496125e-07, + "logits/chosen": -2.9992268085479736, + "logits/rejected": -2.9801385402679443, + "logps/chosen": -71.62583923339844, + "logps/rejected": -48.7337646484375, + "loss": 0.6615, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.022342320531606674, + "rewards/margins": 0.062458496540784836, + "rewards/rejected": -0.04011617973446846, + "step": 260 + }, + { + "epoch": 0.02, + "learning_rate": 8.050089445438284e-07, + "logits/chosen": -2.9760518074035645, + "logits/rejected": -2.9756836891174316, + "logps/chosen": -69.02963256835938, + "logps/rejected": -47.99614715576172, + "loss": 0.6541, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03307611495256424, + "rewards/margins": 0.07970432192087173, + "rewards/rejected": -0.04662821814417839, + "step": 270 + }, + { + "epoch": 0.02, + "learning_rate": 8.348240906380442e-07, + "logits/chosen": -2.972804307937622, + "logits/rejected": -2.973416328430176, + "logps/chosen": -73.17513275146484, + "logps/rejected": -48.799110412597656, + "loss": 0.6536, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03613675758242607, + "rewards/margins": 0.0900457501411438, + "rewards/rejected": -0.05390900373458862, + "step": 280 + }, + { + "epoch": 0.02, + "learning_rate": 8.646392367322601e-07, + "logits/chosen": -2.988776683807373, + "logits/rejected": -2.9511101245880127, + "logps/chosen": -60.7444953918457, + "logps/rejected": -50.1580696105957, + "loss": 0.6511, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02103568986058235, + "rewards/margins": 0.07972956448793411, + "rewards/rejected": -0.05869387462735176, + "step": 290 + }, + { + "epoch": 0.02, + "learning_rate": 8.94454382826476e-07, + "logits/chosen": -2.9847099781036377, + "logits/rejected": -2.9715585708618164, + "logps/chosen": -71.0445327758789, + "logps/rejected": -50.474178314208984, + "loss": 0.6438, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04130948334932327, + "rewards/margins": 0.1042775884270668, + "rewards/rejected": -0.06296811997890472, + "step": 300 + }, + { + "epoch": 0.02, + "learning_rate": 9.242695289206919e-07, + "logits/chosen": -2.991835117340088, + "logits/rejected": -2.986812114715576, + "logps/chosen": -67.70176696777344, + "logps/rejected": -51.52722930908203, + "loss": 0.637, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04277125000953674, + "rewards/margins": 0.11644142866134644, + "rewards/rejected": -0.0736701637506485, + "step": 310 + }, + { + "epoch": 0.02, + "learning_rate": 9.540846750149077e-07, + "logits/chosen": -2.982632875442505, + "logits/rejected": -2.9579172134399414, + "logps/chosen": -61.857505798339844, + "logps/rejected": -51.969200134277344, + "loss": 0.6312, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04146171361207962, + "rewards/margins": 0.11927430331707001, + "rewards/rejected": -0.07781258970499039, + "step": 320 + }, + { + "epoch": 0.02, + "learning_rate": 9.838998211091236e-07, + "logits/chosen": -3.017160415649414, + "logits/rejected": -2.989964008331299, + "logps/chosen": -65.59489440917969, + "logps/rejected": -55.42946243286133, + "loss": 0.6189, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05441339686512947, + "rewards/margins": 0.1520194113254547, + "rewards/rejected": -0.09760601818561554, + "step": 330 + }, + { + "epoch": 0.02, + "learning_rate": 1.0137149672033393e-06, + "logits/chosen": -2.9803266525268555, + "logits/rejected": -2.9845376014709473, + "logps/chosen": -65.06605529785156, + "logps/rejected": -54.811424255371094, + "loss": 0.6096, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07225533574819565, + "rewards/margins": 0.18390336632728577, + "rewards/rejected": -0.11164804548025131, + "step": 340 + }, + { + "epoch": 0.02, + "learning_rate": 1.0435301132975552e-06, + "logits/chosen": -2.964911937713623, + "logits/rejected": -2.96053147315979, + "logps/chosen": -65.23785400390625, + "logps/rejected": -57.24370574951172, + "loss": 0.6011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.059714823961257935, + "rewards/margins": 0.18882234394550323, + "rewards/rejected": -0.1291075348854065, + "step": 350 + }, + { + "epoch": 0.02, + "learning_rate": 1.073345259391771e-06, + "logits/chosen": -2.974783420562744, + "logits/rejected": -2.96232271194458, + "logps/chosen": -64.95095825195312, + "logps/rejected": -60.1099853515625, + "loss": 0.5879, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0736270323395729, + "rewards/margins": 0.21903876960277557, + "rewards/rejected": -0.14541175961494446, + "step": 360 + }, + { + "epoch": 0.02, + "learning_rate": 1.103160405485987e-06, + "logits/chosen": -2.9943270683288574, + "logits/rejected": -2.9654793739318848, + "logps/chosen": -59.869659423828125, + "logps/rejected": -60.060264587402344, + "loss": 0.5776, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10907473415136337, + "rewards/margins": 0.25231292843818665, + "rewards/rejected": -0.14323820173740387, + "step": 370 + }, + { + "epoch": 0.02, + "learning_rate": 1.1329755515802029e-06, + "logits/chosen": -2.983459949493408, + "logits/rejected": -2.9790661334991455, + "logps/chosen": -68.365966796875, + "logps/rejected": -62.22774124145508, + "loss": 0.5663, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10884684324264526, + "rewards/margins": 0.27483612298965454, + "rewards/rejected": -0.1659892499446869, + "step": 380 + }, + { + "epoch": 0.02, + "learning_rate": 1.1627906976744188e-06, + "logits/chosen": -2.971078872680664, + "logits/rejected": -2.9647915363311768, + "logps/chosen": -54.592071533203125, + "logps/rejected": -65.0318832397461, + "loss": 0.5462, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11747223138809204, + "rewards/margins": 0.3192733824253082, + "rewards/rejected": -0.2018011510372162, + "step": 390 + }, + { + "epoch": 0.02, + "learning_rate": 1.1926058437686345e-06, + "logits/chosen": -2.9642443656921387, + "logits/rejected": -2.9650585651397705, + "logps/chosen": -56.22089385986328, + "logps/rejected": -65.07490539550781, + "loss": 0.5367, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14380133152008057, + "rewards/margins": 0.3455009460449219, + "rewards/rejected": -0.2016996145248413, + "step": 400 + }, + { + "epoch": 0.02, + "learning_rate": 1.2224209898628504e-06, + "logits/chosen": -2.9602420330047607, + "logits/rejected": -2.984884262084961, + "logps/chosen": -54.9572639465332, + "logps/rejected": -67.0857162475586, + "loss": 0.5246, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1513342559337616, + "rewards/margins": 0.3829588294029236, + "rewards/rejected": -0.2316245287656784, + "step": 410 + }, + { + "epoch": 0.03, + "learning_rate": 1.252236135957066e-06, + "logits/chosen": -2.984032154083252, + "logits/rejected": -2.960697650909424, + "logps/chosen": -53.50678253173828, + "logps/rejected": -70.7410659790039, + "loss": 0.5036, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17459207773208618, + "rewards/margins": 0.4419061541557312, + "rewards/rejected": -0.26731401681900024, + "step": 420 + }, + { + "epoch": 0.03, + "learning_rate": 1.282051282051282e-06, + "logits/chosen": -3.006791591644287, + "logits/rejected": -2.9790358543395996, + "logps/chosen": -50.71213912963867, + "logps/rejected": -75.37886047363281, + "loss": 0.4926, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14115287363529205, + "rewards/margins": 0.438680499792099, + "rewards/rejected": -0.29752764105796814, + "step": 430 + }, + { + "epoch": 0.03, + "learning_rate": 1.3118664281454979e-06, + "logits/chosen": -2.9824748039245605, + "logits/rejected": -2.9848923683166504, + "logps/chosen": -51.81050491333008, + "logps/rejected": -74.18791961669922, + "loss": 0.4829, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19691374897956848, + "rewards/margins": 0.4909387230873108, + "rewards/rejected": -0.2940249443054199, + "step": 440 + }, + { + "epoch": 0.03, + "learning_rate": 1.3416815742397138e-06, + "logits/chosen": -2.954195737838745, + "logits/rejected": -2.957263946533203, + "logps/chosen": -48.250526428222656, + "logps/rejected": -73.97734069824219, + "loss": 0.4813, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17701885104179382, + "rewards/margins": 0.4652382731437683, + "rewards/rejected": -0.2882193922996521, + "step": 450 + }, + { + "epoch": 0.03, + "learning_rate": 1.3714967203339297e-06, + "logits/chosen": -3.00996732711792, + "logits/rejected": -3.002124309539795, + "logps/chosen": -46.57711410522461, + "logps/rejected": -80.85877990722656, + "loss": 0.4508, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20726975798606873, + "rewards/margins": 0.5677331686019897, + "rewards/rejected": -0.36046338081359863, + "step": 460 + }, + { + "epoch": 0.03, + "learning_rate": 1.4013118664281456e-06, + "logits/chosen": -2.954026937484741, + "logits/rejected": -2.932624340057373, + "logps/chosen": -46.166603088378906, + "logps/rejected": -75.62085723876953, + "loss": 0.4484, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1909331977367401, + "rewards/margins": 0.5219781398773193, + "rewards/rejected": -0.3310449421405792, + "step": 470 + }, + { + "epoch": 0.03, + "learning_rate": 1.4311270125223615e-06, + "logits/chosen": -2.980708360671997, + "logits/rejected": -2.976071834564209, + "logps/chosen": -47.06877899169922, + "logps/rejected": -85.34896850585938, + "loss": 0.4285, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2043842375278473, + "rewards/margins": 0.6243103742599487, + "rewards/rejected": -0.41992610692977905, + "step": 480 + }, + { + "epoch": 0.03, + "learning_rate": 1.4609421586165772e-06, + "logits/chosen": -2.9911887645721436, + "logits/rejected": -2.943816661834717, + "logps/chosen": -46.52109146118164, + "logps/rejected": -87.4489517211914, + "loss": 0.4143, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23063774406909943, + "rewards/margins": 0.674059271812439, + "rewards/rejected": -0.4434216022491455, + "step": 490 + }, + { + "epoch": 0.03, + "learning_rate": 1.490757304710793e-06, + "logits/chosen": -2.9820945262908936, + "logits/rejected": -2.9838337898254395, + "logps/chosen": -54.989166259765625, + "logps/rejected": -90.59204864501953, + "loss": 0.3981, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23931631445884705, + "rewards/margins": 0.7045159339904785, + "rewards/rejected": -0.4651995599269867, + "step": 500 + }, + { + "epoch": 0.03, + "learning_rate": 1.520572450805009e-06, + "logits/chosen": -2.9805257320404053, + "logits/rejected": -2.9528656005859375, + "logps/chosen": -46.284706115722656, + "logps/rejected": -95.25877380371094, + "loss": 0.3966, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2438308745622635, + "rewards/margins": 0.7423723936080933, + "rewards/rejected": -0.49854153394699097, + "step": 510 + }, + { + "epoch": 0.03, + "learning_rate": 1.550387596899225e-06, + "logits/chosen": -2.9418282508850098, + "logits/rejected": -2.9414196014404297, + "logps/chosen": -48.37353515625, + "logps/rejected": -91.3024673461914, + "loss": 0.396, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2332138568162918, + "rewards/margins": 0.7105187773704529, + "rewards/rejected": -0.4773048758506775, + "step": 520 + }, + { + "epoch": 0.03, + "learning_rate": 1.5802027429934408e-06, + "logits/chosen": -2.9739184379577637, + "logits/rejected": -2.97147798538208, + "logps/chosen": -47.54608154296875, + "logps/rejected": -96.32991027832031, + "loss": 0.382, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23890534043312073, + "rewards/margins": 0.7606258392333984, + "rewards/rejected": -0.5217204093933105, + "step": 530 + }, + { + "epoch": 0.03, + "learning_rate": 1.6100178890876567e-06, + "logits/chosen": -2.980813503265381, + "logits/rejected": -2.963414430618286, + "logps/chosen": -50.37831115722656, + "logps/rejected": -96.20535278320312, + "loss": 0.3834, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24395489692687988, + "rewards/margins": 0.7731548547744751, + "rewards/rejected": -0.5291999578475952, + "step": 540 + }, + { + "epoch": 0.03, + "learning_rate": 1.6398330351818726e-06, + "logits/chosen": -2.966637134552002, + "logits/rejected": -2.932892322540283, + "logps/chosen": -44.41714096069336, + "logps/rejected": -100.84266662597656, + "loss": 0.3651, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25574031472206116, + "rewards/margins": 0.8291279077529907, + "rewards/rejected": -0.5733876824378967, + "step": 550 + }, + { + "epoch": 0.03, + "learning_rate": 1.6696481812760883e-06, + "logits/chosen": -2.9682445526123047, + "logits/rejected": -2.9829764366149902, + "logps/chosen": -43.2005500793457, + "logps/rejected": -104.4155044555664, + "loss": 0.361, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24430926144123077, + "rewards/margins": 0.8398653268814087, + "rewards/rejected": -0.5955560803413391, + "step": 560 + }, + { + "epoch": 0.03, + "learning_rate": 1.6994633273703042e-06, + "logits/chosen": -3.0001132488250732, + "logits/rejected": -2.978228807449341, + "logps/chosen": -47.23787307739258, + "logps/rejected": -99.9131088256836, + "loss": 0.3547, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28121238946914673, + "rewards/margins": 0.8462437391281128, + "rewards/rejected": -0.5650314092636108, + "step": 570 + }, + { + "epoch": 0.03, + "learning_rate": 1.7292784734645201e-06, + "logits/chosen": -2.9574625492095947, + "logits/rejected": -2.928567409515381, + "logps/chosen": -39.49331283569336, + "logps/rejected": -107.9442138671875, + "loss": 0.3386, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2691844701766968, + "rewards/margins": 0.9074603915214539, + "rewards/rejected": -0.6382759809494019, + "step": 580 + }, + { + "epoch": 0.04, + "learning_rate": 1.759093619558736e-06, + "logits/chosen": -2.9369075298309326, + "logits/rejected": -2.9088616371154785, + "logps/chosen": -42.70224380493164, + "logps/rejected": -110.30489349365234, + "loss": 0.3383, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27101626992225647, + "rewards/margins": 0.9230680465698242, + "rewards/rejected": -0.6520518064498901, + "step": 590 + }, + { + "epoch": 0.04, + "learning_rate": 1.788908765652952e-06, + "logits/chosen": -2.930457592010498, + "logits/rejected": -2.91943097114563, + "logps/chosen": -43.339229583740234, + "logps/rejected": -107.48307800292969, + "loss": 0.3407, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25904375314712524, + "rewards/margins": 0.8748016357421875, + "rewards/rejected": -0.615757942199707, + "step": 600 + }, + { + "epoch": 0.04, + "learning_rate": 1.8187239117471678e-06, + "logits/chosen": -2.9587793350219727, + "logits/rejected": -2.9344980716705322, + "logps/chosen": -39.99372482299805, + "logps/rejected": -109.69963073730469, + "loss": 0.3289, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2784316837787628, + "rewards/margins": 0.9442272186279297, + "rewards/rejected": -0.6657954454421997, + "step": 610 + }, + { + "epoch": 0.04, + "learning_rate": 1.8485390578413837e-06, + "logits/chosen": -2.971811532974243, + "logits/rejected": -2.9104809761047363, + "logps/chosen": -38.652320861816406, + "logps/rejected": -113.38652038574219, + "loss": 0.3233, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2652958333492279, + "rewards/margins": 0.9442809820175171, + "rewards/rejected": -0.6789851188659668, + "step": 620 + }, + { + "epoch": 0.04, + "learning_rate": 1.8783542039355994e-06, + "logits/chosen": -2.9191136360168457, + "logits/rejected": -2.8827223777770996, + "logps/chosen": -42.28081512451172, + "logps/rejected": -119.88639831542969, + "loss": 0.306, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29589518904685974, + "rewards/margins": 1.0485247373580933, + "rewards/rejected": -0.7526295781135559, + "step": 630 + }, + { + "epoch": 0.04, + "learning_rate": 1.9081693500298153e-06, + "logits/chosen": -2.957550048828125, + "logits/rejected": -2.9400954246520996, + "logps/chosen": -34.615272521972656, + "logps/rejected": -121.06828308105469, + "loss": 0.303, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2980164587497711, + "rewards/margins": 1.0627772808074951, + "rewards/rejected": -0.7647607326507568, + "step": 640 + }, + { + "epoch": 0.04, + "learning_rate": 1.9379844961240315e-06, + "logits/chosen": -2.9378323554992676, + "logits/rejected": -2.9265735149383545, + "logps/chosen": -42.065208435058594, + "logps/rejected": -117.59049987792969, + "loss": 0.3034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27747949957847595, + "rewards/margins": 1.001206636428833, + "rewards/rejected": -0.7237271666526794, + "step": 650 + }, + { + "epoch": 0.04, + "learning_rate": 1.967799642218247e-06, + "logits/chosen": -2.9484689235687256, + "logits/rejected": -2.9170165061950684, + "logps/chosen": -38.046356201171875, + "logps/rejected": -125.5303955078125, + "loss": 0.2924, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3038211464881897, + "rewards/margins": 1.1033309698104858, + "rewards/rejected": -0.7995098233222961, + "step": 660 + }, + { + "epoch": 0.04, + "learning_rate": 1.997614788312463e-06, + "logits/chosen": -2.9291698932647705, + "logits/rejected": -2.913149356842041, + "logps/chosen": -40.59954071044922, + "logps/rejected": -127.72261047363281, + "loss": 0.28, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29420167207717896, + "rewards/margins": 1.1142350435256958, + "rewards/rejected": -0.8200333714485168, + "step": 670 + }, + { + "epoch": 0.04, + "learning_rate": 2.0274299344066785e-06, + "logits/chosen": -2.97379732131958, + "logits/rejected": -2.90913724899292, + "logps/chosen": -41.133975982666016, + "logps/rejected": -131.36251831054688, + "loss": 0.2704, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2960251271724701, + "rewards/margins": 1.1637284755706787, + "rewards/rejected": -0.8677034378051758, + "step": 680 + }, + { + "epoch": 0.04, + "learning_rate": 2.0572450805008946e-06, + "logits/chosen": -2.957440137863159, + "logits/rejected": -2.902618885040283, + "logps/chosen": -38.768409729003906, + "logps/rejected": -135.829833984375, + "loss": 0.2545, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32904133200645447, + "rewards/margins": 1.2436773777008057, + "rewards/rejected": -0.9146361351013184, + "step": 690 + }, + { + "epoch": 0.04, + "learning_rate": 2.0870602265951103e-06, + "logits/chosen": -2.9167263507843018, + "logits/rejected": -2.890842914581299, + "logps/chosen": -44.03474807739258, + "logps/rejected": -139.43023681640625, + "loss": 0.2503, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30538904666900635, + "rewards/margins": 1.2593690156936646, + "rewards/rejected": -0.9539799690246582, + "step": 700 + }, + { + "epoch": 0.04, + "learning_rate": 2.1168753726893265e-06, + "logits/chosen": -2.924889326095581, + "logits/rejected": -2.8740432262420654, + "logps/chosen": -40.46274185180664, + "logps/rejected": -145.13734436035156, + "loss": 0.2428, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2993396520614624, + "rewards/margins": 1.3159592151641846, + "rewards/rejected": -1.0166196823120117, + "step": 710 + }, + { + "epoch": 0.04, + "learning_rate": 2.146690518783542e-06, + "logits/chosen": -2.9009227752685547, + "logits/rejected": -2.9008853435516357, + "logps/chosen": -37.61160659790039, + "logps/rejected": -144.91842651367188, + "loss": 0.239, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2984068989753723, + "rewards/margins": 1.3076552152633667, + "rewards/rejected": -1.00924813747406, + "step": 720 + }, + { + "epoch": 0.04, + "learning_rate": 2.176505664877758e-06, + "logits/chosen": -2.9028007984161377, + "logits/rejected": -2.8786234855651855, + "logps/chosen": -38.436622619628906, + "logps/rejected": -151.5777130126953, + "loss": 0.2284, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3371207118034363, + "rewards/margins": 1.4050734043121338, + "rewards/rejected": -1.0679528713226318, + "step": 730 + }, + { + "epoch": 0.04, + "learning_rate": 2.206320810971974e-06, + "logits/chosen": -2.916623592376709, + "logits/rejected": -2.8668630123138428, + "logps/chosen": -39.584632873535156, + "logps/rejected": -150.6464080810547, + "loss": 0.2279, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30783018469810486, + "rewards/margins": 1.3758673667907715, + "rewards/rejected": -1.0680371522903442, + "step": 740 + }, + { + "epoch": 0.04, + "learning_rate": 2.2361359570661897e-06, + "logits/chosen": -2.887272357940674, + "logits/rejected": -2.8534915447235107, + "logps/chosen": -36.246482849121094, + "logps/rejected": -153.67608642578125, + "loss": 0.2179, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3153519034385681, + "rewards/margins": 1.4092376232147217, + "rewards/rejected": -1.0938857793807983, + "step": 750 + }, + { + "epoch": 0.05, + "learning_rate": 2.2659511031604058e-06, + "logits/chosen": -2.8925223350524902, + "logits/rejected": -2.884477138519287, + "logps/chosen": -36.67290115356445, + "logps/rejected": -161.21798706054688, + "loss": 0.2039, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3240107595920563, + "rewards/margins": 1.5021789073944092, + "rewards/rejected": -1.1781680583953857, + "step": 760 + }, + { + "epoch": 0.05, + "learning_rate": 2.2957662492546215e-06, + "logits/chosen": -2.88120698928833, + "logits/rejected": -2.802581310272217, + "logps/chosen": -40.013450622558594, + "logps/rejected": -159.5142822265625, + "loss": 0.2114, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33028337359428406, + "rewards/margins": 1.4851129055023193, + "rewards/rejected": -1.154829502105713, + "step": 770 + }, + { + "epoch": 0.05, + "learning_rate": 2.3255813953488376e-06, + "logits/chosen": -2.8996691703796387, + "logits/rejected": -2.859790325164795, + "logps/chosen": -39.816078186035156, + "logps/rejected": -165.71630859375, + "loss": 0.2079, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29694247245788574, + "rewards/margins": 1.5048564672470093, + "rewards/rejected": -1.207914113998413, + "step": 780 + }, + { + "epoch": 0.05, + "learning_rate": 2.3553965414430533e-06, + "logits/chosen": -2.9014010429382324, + "logits/rejected": -2.8186144828796387, + "logps/chosen": -36.43000793457031, + "logps/rejected": -166.2866973876953, + "loss": 0.1945, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31615301966667175, + "rewards/margins": 1.5481258630752563, + "rewards/rejected": -1.231972575187683, + "step": 790 + }, + { + "epoch": 0.05, + "learning_rate": 2.385211687537269e-06, + "logits/chosen": -2.9162821769714355, + "logits/rejected": -2.829958915710449, + "logps/chosen": -41.460723876953125, + "logps/rejected": -172.53224182128906, + "loss": 0.1916, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32863444089889526, + "rewards/margins": 1.5999221801757812, + "rewards/rejected": -1.2712876796722412, + "step": 800 + }, + { + "epoch": 0.05, + "learning_rate": 2.415026833631485e-06, + "logits/chosen": -2.9372239112854004, + "logits/rejected": -2.848865032196045, + "logps/chosen": -39.65186309814453, + "logps/rejected": -167.17933654785156, + "loss": 0.1949, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31408971548080444, + "rewards/margins": 1.5402828454971313, + "rewards/rejected": -1.2261930704116821, + "step": 810 + }, + { + "epoch": 0.05, + "learning_rate": 2.4448419797257008e-06, + "logits/chosen": -2.9475319385528564, + "logits/rejected": -2.8369367122650146, + "logps/chosen": -42.22636032104492, + "logps/rejected": -178.23306274414062, + "loss": 0.1866, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31575629115104675, + "rewards/margins": 1.648012399673462, + "rewards/rejected": -1.3322560787200928, + "step": 820 + }, + { + "epoch": 0.05, + "learning_rate": 2.474657125819917e-06, + "logits/chosen": -2.9107608795166016, + "logits/rejected": -2.8390917778015137, + "logps/chosen": -39.65138244628906, + "logps/rejected": -177.58013916015625, + "loss": 0.1773, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.312017023563385, + "rewards/margins": 1.658429503440857, + "rewards/rejected": -1.3464124202728271, + "step": 830 + }, + { + "epoch": 0.05, + "learning_rate": 2.504472271914132e-06, + "logits/chosen": -2.923290729522705, + "logits/rejected": -2.829624652862549, + "logps/chosen": -36.920780181884766, + "logps/rejected": -183.22515869140625, + "loss": 0.172, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3202868103981018, + "rewards/margins": 1.7096493244171143, + "rewards/rejected": -1.3893625736236572, + "step": 840 + }, + { + "epoch": 0.05, + "learning_rate": 2.5342874180083483e-06, + "logits/chosen": -2.9353766441345215, + "logits/rejected": -2.8816449642181396, + "logps/chosen": -44.964141845703125, + "logps/rejected": -185.05551147460938, + "loss": 0.1768, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2919327914714813, + "rewards/margins": 1.7024953365325928, + "rewards/rejected": -1.4105623960494995, + "step": 850 + }, + { + "epoch": 0.05, + "learning_rate": 2.564102564102564e-06, + "logits/chosen": -2.879239797592163, + "logits/rejected": -2.8295459747314453, + "logps/chosen": -40.0316162109375, + "logps/rejected": -181.7423553466797, + "loss": 0.1721, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31078919768333435, + "rewards/margins": 1.6821924448013306, + "rewards/rejected": -1.3714032173156738, + "step": 860 + }, + { + "epoch": 0.05, + "learning_rate": 2.59391771019678e-06, + "logits/chosen": -2.9260551929473877, + "logits/rejected": -2.8518142700195312, + "logps/chosen": -34.84546661376953, + "logps/rejected": -186.55203247070312, + "loss": 0.1601, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3088172972202301, + "rewards/margins": 1.7361927032470703, + "rewards/rejected": -1.4273754358291626, + "step": 870 + }, + { + "epoch": 0.05, + "learning_rate": 2.6237328562909958e-06, + "logits/chosen": -2.9526896476745605, + "logits/rejected": -2.864396333694458, + "logps/chosen": -46.93625259399414, + "logps/rejected": -195.2786407470703, + "loss": 0.1569, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3179161548614502, + "rewards/margins": 1.8251888751983643, + "rewards/rejected": -1.507272481918335, + "step": 880 + }, + { + "epoch": 0.05, + "learning_rate": 2.653548002385212e-06, + "logits/chosen": -2.946425199508667, + "logits/rejected": -2.8477015495300293, + "logps/chosen": -44.20731735229492, + "logps/rejected": -191.15806579589844, + "loss": 0.1558, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2844986021518707, + "rewards/margins": 1.7495861053466797, + "rewards/rejected": -1.465087652206421, + "step": 890 + }, + { + "epoch": 0.05, + "learning_rate": 2.6833631484794276e-06, + "logits/chosen": -2.908036231994629, + "logits/rejected": -2.848543643951416, + "logps/chosen": -42.08452606201172, + "logps/rejected": -193.06002807617188, + "loss": 0.1695, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27534404397010803, + "rewards/margins": 1.7566055059432983, + "rewards/rejected": -1.4812614917755127, + "step": 900 + }, + { + "epoch": 0.05, + "learning_rate": 2.7131782945736433e-06, + "logits/chosen": -2.9223923683166504, + "logits/rejected": -2.842595100402832, + "logps/chosen": -45.47929763793945, + "logps/rejected": -196.42083740234375, + "loss": 0.1652, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2668255567550659, + "rewards/margins": 1.7994037866592407, + "rewards/rejected": -1.5325781106948853, + "step": 910 + }, + { + "epoch": 0.05, + "learning_rate": 2.7429934406678594e-06, + "logits/chosen": -2.9216389656066895, + "logits/rejected": -2.8908047676086426, + "logps/chosen": -45.96767807006836, + "logps/rejected": -206.0484161376953, + "loss": 0.1433, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2884276807308197, + "rewards/margins": 1.9038997888565063, + "rewards/rejected": -1.6154720783233643, + "step": 920 + }, + { + "epoch": 0.06, + "learning_rate": 2.772808586762075e-06, + "logits/chosen": -2.8979265689849854, + "logits/rejected": -2.8431503772735596, + "logps/chosen": -49.884559631347656, + "logps/rejected": -198.9449005126953, + "loss": 0.1568, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25000572204589844, + "rewards/margins": 1.7875381708145142, + "rewards/rejected": -1.5375325679779053, + "step": 930 + }, + { + "epoch": 0.06, + "learning_rate": 2.802623732856291e-06, + "logits/chosen": -2.906466245651245, + "logits/rejected": -2.8588998317718506, + "logps/chosen": -48.54482650756836, + "logps/rejected": -213.2858123779297, + "loss": 0.137, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22776472568511963, + "rewards/margins": 1.9271682500839233, + "rewards/rejected": -1.6994035243988037, + "step": 940 + }, + { + "epoch": 0.06, + "learning_rate": 2.832438878950507e-06, + "logits/chosen": -2.932741641998291, + "logits/rejected": -2.8750081062316895, + "logps/chosen": -43.689491271972656, + "logps/rejected": -205.803955078125, + "loss": 0.1513, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19723446667194366, + "rewards/margins": 1.818921446800232, + "rewards/rejected": -1.6216869354248047, + "step": 950 + }, + { + "epoch": 0.06, + "learning_rate": 2.862254025044723e-06, + "logits/chosen": -2.927215099334717, + "logits/rejected": -2.8479530811309814, + "logps/chosen": -48.602638244628906, + "logps/rejected": -216.87960815429688, + "loss": 0.1434, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22594816982746124, + "rewards/margins": 1.9465906620025635, + "rewards/rejected": -1.7206424474716187, + "step": 960 + }, + { + "epoch": 0.06, + "learning_rate": 2.8920691711389387e-06, + "logits/chosen": -2.9289841651916504, + "logits/rejected": -2.8292651176452637, + "logps/chosen": -46.494232177734375, + "logps/rejected": -209.7342987060547, + "loss": 0.1407, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20923173427581787, + "rewards/margins": 1.8711140155792236, + "rewards/rejected": -1.6618821620941162, + "step": 970 + }, + { + "epoch": 0.06, + "learning_rate": 2.9218843172331544e-06, + "logits/chosen": -2.9083921909332275, + "logits/rejected": -2.9028637409210205, + "logps/chosen": -54.48478317260742, + "logps/rejected": -215.77587890625, + "loss": 0.1418, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18314385414123535, + "rewards/margins": 1.8907610177993774, + "rewards/rejected": -1.7076170444488525, + "step": 980 + }, + { + "epoch": 0.06, + "learning_rate": 2.9516994633273705e-06, + "logits/chosen": -2.917245864868164, + "logits/rejected": -2.8610386848449707, + "logps/chosen": -52.58135223388672, + "logps/rejected": -220.6952362060547, + "loss": 0.1329, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1814514845609665, + "rewards/margins": 1.937570333480835, + "rewards/rejected": -1.7561187744140625, + "step": 990 + }, + { + "epoch": 0.06, + "learning_rate": 2.981514609421586e-06, + "logits/chosen": -2.9567294120788574, + "logits/rejected": -2.8885066509246826, + "logps/chosen": -51.50627517700195, + "logps/rejected": -227.5182647705078, + "loss": 0.1303, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1922745406627655, + "rewards/margins": 2.0097155570983887, + "rewards/rejected": -1.8174407482147217, + "step": 1000 + }, + { + "epoch": 0.06, + "learning_rate": 3.0113297555158023e-06, + "logits/chosen": -2.934863567352295, + "logits/rejected": -2.8902747631073, + "logps/chosen": -50.67198181152344, + "logps/rejected": -222.57131958007812, + "loss": 0.1366, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18493175506591797, + "rewards/margins": 1.980546236038208, + "rewards/rejected": -1.7956146001815796, + "step": 1010 + }, + { + "epoch": 0.06, + "learning_rate": 3.041144901610018e-06, + "logits/chosen": -2.9291841983795166, + "logits/rejected": -2.8428688049316406, + "logps/chosen": -49.201480865478516, + "logps/rejected": -232.1763458251953, + "loss": 0.1267, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20666006207466125, + "rewards/margins": 2.089517593383789, + "rewards/rejected": -1.882857322692871, + "step": 1020 + }, + { + "epoch": 0.06, + "learning_rate": 3.070960047704234e-06, + "logits/chosen": -2.949662446975708, + "logits/rejected": -2.8584136962890625, + "logps/chosen": -50.243751525878906, + "logps/rejected": -232.68533325195312, + "loss": 0.1221, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16892682015895844, + "rewards/margins": 2.0424437522888184, + "rewards/rejected": -1.8735166788101196, + "step": 1030 + }, + { + "epoch": 0.06, + "learning_rate": 3.10077519379845e-06, + "logits/chosen": -2.9093775749206543, + "logits/rejected": -2.820742607116699, + "logps/chosen": -53.60634231567383, + "logps/rejected": -228.4220733642578, + "loss": 0.1272, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1871906816959381, + "rewards/margins": 2.0358026027679443, + "rewards/rejected": -1.848611831665039, + "step": 1040 + }, + { + "epoch": 0.06, + "learning_rate": 3.1305903398926655e-06, + "logits/chosen": -2.9516549110412598, + "logits/rejected": -2.8534727096557617, + "logps/chosen": -52.65508270263672, + "logps/rejected": -245.27963256835938, + "loss": 0.1129, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19569073617458344, + "rewards/margins": 2.213348627090454, + "rewards/rejected": -2.01765775680542, + "step": 1050 + }, + { + "epoch": 0.06, + "learning_rate": 3.1604054859868816e-06, + "logits/chosen": -2.925849199295044, + "logits/rejected": -2.821476936340332, + "logps/chosen": -55.7864990234375, + "logps/rejected": -251.8877716064453, + "loss": 0.1093, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13442271947860718, + "rewards/margins": 2.21010684967041, + "rewards/rejected": -2.075684070587158, + "step": 1060 + }, + { + "epoch": 0.06, + "learning_rate": 3.1902206320810973e-06, + "logits/chosen": -2.918884754180908, + "logits/rejected": -2.8218636512756348, + "logps/chosen": -53.832305908203125, + "logps/rejected": -248.07412719726562, + "loss": 0.1133, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16307690739631653, + "rewards/margins": 2.2009968757629395, + "rewards/rejected": -2.0379199981689453, + "step": 1070 + }, + { + "epoch": 0.06, + "learning_rate": 3.2200357781753134e-06, + "logits/chosen": -2.913417100906372, + "logits/rejected": -2.8137714862823486, + "logps/chosen": -61.895362854003906, + "logps/rejected": -257.60272216796875, + "loss": 0.1021, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12998700141906738, + "rewards/margins": 2.2668581008911133, + "rewards/rejected": -2.136870861053467, + "step": 1080 + }, + { + "epoch": 0.06, + "learning_rate": 3.249850924269529e-06, + "logits/chosen": -2.9006271362304688, + "logits/rejected": -2.8202404975891113, + "logps/chosen": -62.01024627685547, + "logps/rejected": -253.77981567382812, + "loss": 0.1087, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12107900530099869, + "rewards/margins": 2.221971035003662, + "rewards/rejected": -2.1008923053741455, + "step": 1090 + }, + { + "epoch": 0.07, + "learning_rate": 3.2796660703637452e-06, + "logits/chosen": -2.9391419887542725, + "logits/rejected": -2.8563625812530518, + "logps/chosen": -60.92161178588867, + "logps/rejected": -277.9583435058594, + "loss": 0.1016, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07013644278049469, + "rewards/margins": 2.408625841140747, + "rewards/rejected": -2.338489055633545, + "step": 1100 + }, + { + "epoch": 0.07, + "learning_rate": 3.309481216457961e-06, + "logits/chosen": -2.936723232269287, + "logits/rejected": -2.809131145477295, + "logps/chosen": -63.90967559814453, + "logps/rejected": -287.70257568359375, + "loss": 0.0921, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07646025717258453, + "rewards/margins": 2.511096477508545, + "rewards/rejected": -2.434635877609253, + "step": 1110 + }, + { + "epoch": 0.07, + "learning_rate": 3.3392963625521766e-06, + "logits/chosen": -2.854280948638916, + "logits/rejected": -2.7536861896514893, + "logps/chosen": -69.69160461425781, + "logps/rejected": -300.3216247558594, + "loss": 0.0907, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005916008725762367, + "rewards/margins": 2.5596814155578613, + "rewards/rejected": -2.553765296936035, + "step": 1120 + }, + { + "epoch": 0.07, + "learning_rate": 3.3691115086463927e-06, + "logits/chosen": -2.911325693130493, + "logits/rejected": -2.7586543560028076, + "logps/chosen": -82.3992919921875, + "logps/rejected": -371.47528076171875, + "loss": 0.0623, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1462583988904953, + "rewards/margins": 3.1211297512054443, + "rewards/rejected": -3.267388105392456, + "step": 1130 + }, + { + "epoch": 0.07, + "learning_rate": 3.3989266547406084e-06, + "logits/chosen": -2.8932929039001465, + "logits/rejected": -2.788323402404785, + "logps/chosen": -89.82049560546875, + "logps/rejected": -368.25177001953125, + "loss": 0.0729, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23859450221061707, + "rewards/margins": 3.0021443367004395, + "rewards/rejected": -3.240739107131958, + "step": 1140 + }, + { + "epoch": 0.07, + "learning_rate": 3.4287418008348246e-06, + "logits/chosen": -2.913886308670044, + "logits/rejected": -2.7686142921447754, + "logps/chosen": -118.69229888916016, + "logps/rejected": -467.5948181152344, + "loss": 0.0503, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45577478408813477, + "rewards/margins": 3.7719509601593018, + "rewards/rejected": -4.227725505828857, + "step": 1150 + }, + { + "epoch": 0.07, + "learning_rate": 3.4585569469290402e-06, + "logits/chosen": -2.889310836791992, + "logits/rejected": -2.7985191345214844, + "logps/chosen": -74.08802795410156, + "logps/rejected": -435.2679138183594, + "loss": 0.0444, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025621002539992332, + "rewards/margins": 3.877232313156128, + "rewards/rejected": -3.902853012084961, + "step": 1160 + }, + { + "epoch": 0.07, + "learning_rate": 3.4883720930232564e-06, + "logits/chosen": -2.858800172805786, + "logits/rejected": -2.7574596405029297, + "logps/chosen": -122.84834289550781, + "logps/rejected": -501.29638671875, + "loss": 0.0417, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49279889464378357, + "rewards/margins": 4.067281246185303, + "rewards/rejected": -4.560080051422119, + "step": 1170 + }, + { + "epoch": 0.07, + "learning_rate": 3.518187239117472e-06, + "logits/chosen": -2.86140513420105, + "logits/rejected": -2.772339105606079, + "logps/chosen": -101.2002944946289, + "logps/rejected": -492.4452209472656, + "loss": 0.0374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2555873990058899, + "rewards/margins": 4.228099346160889, + "rewards/rejected": -4.483687400817871, + "step": 1180 + }, + { + "epoch": 0.07, + "learning_rate": 3.5480023852116878e-06, + "logits/chosen": -2.907498359680176, + "logits/rejected": -2.7997279167175293, + "logps/chosen": -126.2753677368164, + "logps/rejected": -543.5437622070312, + "loss": 0.0398, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5438970923423767, + "rewards/margins": 4.444826126098633, + "rewards/rejected": -4.988723278045654, + "step": 1190 + }, + { + "epoch": 0.07, + "learning_rate": 3.577817531305904e-06, + "logits/chosen": -2.8707385063171387, + "logits/rejected": -2.7878029346466064, + "logps/chosen": -102.17366027832031, + "logps/rejected": -607.9171142578125, + "loss": 0.0312, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3222612738609314, + "rewards/margins": 5.321918964385986, + "rewards/rejected": -5.644179344177246, + "step": 1200 + }, + { + "epoch": 0.07, + "learning_rate": 3.6076326774001196e-06, + "logits/chosen": -2.9233720302581787, + "logits/rejected": -2.814185380935669, + "logps/chosen": -80.7733383178711, + "logps/rejected": -581.2581176757812, + "loss": 0.0336, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05069941282272339, + "rewards/margins": 5.321071147918701, + "rewards/rejected": -5.371769905090332, + "step": 1210 + }, + { + "epoch": 0.07, + "learning_rate": 3.6374478234943357e-06, + "logits/chosen": -2.8979592323303223, + "logits/rejected": -2.8126442432403564, + "logps/chosen": -99.09781646728516, + "logps/rejected": -586.7628173828125, + "loss": 0.0323, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25345659255981445, + "rewards/margins": 5.178110122680664, + "rewards/rejected": -5.431567192077637, + "step": 1220 + }, + { + "epoch": 0.07, + "learning_rate": 3.6672629695885514e-06, + "logits/chosen": -2.8876588344573975, + "logits/rejected": -2.7811882495880127, + "logps/chosen": -105.880615234375, + "logps/rejected": -553.8966064453125, + "loss": 0.0667, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3590214252471924, + "rewards/margins": 4.745104789733887, + "rewards/rejected": -5.1041259765625, + "step": 1230 + }, + { + "epoch": 0.07, + "learning_rate": 3.6970781156827675e-06, + "logits/chosen": -2.906069040298462, + "logits/rejected": -2.804664134979248, + "logps/chosen": -103.67093658447266, + "logps/rejected": -587.0574340820312, + "loss": 0.0457, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3086903691291809, + "rewards/margins": 5.122992515563965, + "rewards/rejected": -5.431683540344238, + "step": 1240 + }, + { + "epoch": 0.07, + "learning_rate": 3.726893261776983e-06, + "logits/chosen": -2.8762924671173096, + "logits/rejected": -2.797161817550659, + "logps/chosen": -99.7688980102539, + "logps/rejected": -596.4459228515625, + "loss": 0.023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33934131264686584, + "rewards/margins": 5.174071788787842, + "rewards/rejected": -5.513413906097412, + "step": 1250 + }, + { + "epoch": 0.08, + "learning_rate": 3.756708407871199e-06, + "logits/chosen": -2.942321538925171, + "logits/rejected": -2.8218114376068115, + "logps/chosen": -107.68839263916016, + "logps/rejected": -625.3644409179688, + "loss": 0.0356, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3565952777862549, + "rewards/margins": 5.4525322914123535, + "rewards/rejected": -5.809127330780029, + "step": 1260 + }, + { + "epoch": 0.08, + "learning_rate": 3.786523553965415e-06, + "logits/chosen": -2.911815881729126, + "logits/rejected": -2.770745038986206, + "logps/chosen": -85.6732177734375, + "logps/rejected": -657.2732543945312, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1061444878578186, + "rewards/margins": 6.0360918045043945, + "rewards/rejected": -6.14223575592041, + "step": 1270 + }, + { + "epoch": 0.08, + "learning_rate": 3.816338700059631e-06, + "logits/chosen": -2.886545181274414, + "logits/rejected": -2.734433650970459, + "logps/chosen": -115.2284927368164, + "logps/rejected": -633.8414306640625, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40496665239334106, + "rewards/margins": 5.501077651977539, + "rewards/rejected": -5.906044006347656, + "step": 1280 + }, + { + "epoch": 0.08, + "learning_rate": 3.846153846153847e-06, + "logits/chosen": -2.8823819160461426, + "logits/rejected": -2.7652571201324463, + "logps/chosen": -128.81338500976562, + "logps/rejected": -752.5650634765625, + "loss": 0.0287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5743337869644165, + "rewards/margins": 6.505009651184082, + "rewards/rejected": -7.079343318939209, + "step": 1290 + }, + { + "epoch": 0.08, + "learning_rate": 3.875968992248063e-06, + "logits/chosen": -2.888608932495117, + "logits/rejected": -2.8110814094543457, + "logps/chosen": -114.83067321777344, + "logps/rejected": -707.441650390625, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4642557203769684, + "rewards/margins": 6.154486179351807, + "rewards/rejected": -6.6187424659729, + "step": 1300 + }, + { + "epoch": 0.08, + "learning_rate": 3.905784138342278e-06, + "logits/chosen": -2.928072452545166, + "logits/rejected": -2.810481548309326, + "logps/chosen": -145.2091522216797, + "logps/rejected": -704.4346923828125, + "loss": 0.0342, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6881845593452454, + "rewards/margins": 5.919460773468018, + "rewards/rejected": -6.607645511627197, + "step": 1310 + }, + { + "epoch": 0.08, + "learning_rate": 3.935599284436494e-06, + "logits/chosen": -2.868002414703369, + "logits/rejected": -2.765359401702881, + "logps/chosen": -131.2050323486328, + "logps/rejected": -707.6498413085938, + "loss": 0.0287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6056081056594849, + "rewards/margins": 6.0184478759765625, + "rewards/rejected": -6.6240553855896, + "step": 1320 + }, + { + "epoch": 0.08, + "learning_rate": 3.96541443053071e-06, + "logits/chosen": -2.885503053665161, + "logits/rejected": -2.8132071495056152, + "logps/chosen": -112.09709167480469, + "logps/rejected": -741.6683349609375, + "loss": 0.0228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36491528153419495, + "rewards/margins": 6.602360725402832, + "rewards/rejected": -6.967276096343994, + "step": 1330 + }, + { + "epoch": 0.08, + "learning_rate": 3.995229576624926e-06, + "logits/chosen": -2.9085755348205566, + "logits/rejected": -2.835230827331543, + "logps/chosen": -108.37870025634766, + "logps/rejected": -705.7604370117188, + "loss": 0.032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4209515154361725, + "rewards/margins": 6.193148136138916, + "rewards/rejected": -6.614099025726318, + "step": 1340 + }, + { + "epoch": 0.08, + "learning_rate": 4.025044722719142e-06, + "logits/chosen": -2.905709743499756, + "logits/rejected": -2.8284640312194824, + "logps/chosen": -91.01869201660156, + "logps/rejected": -756.5321044921875, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20456485450267792, + "rewards/margins": 6.914083957672119, + "rewards/rejected": -7.118648529052734, + "step": 1350 + }, + { + "epoch": 0.08, + "learning_rate": 4.054859868813357e-06, + "logits/chosen": -2.9096837043762207, + "logits/rejected": -2.811187267303467, + "logps/chosen": -93.60760498046875, + "logps/rejected": -772.1401977539062, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17053982615470886, + "rewards/margins": 7.10394287109375, + "rewards/rejected": -7.274481296539307, + "step": 1360 + }, + { + "epoch": 0.08, + "learning_rate": 4.084675014907573e-06, + "logits/chosen": -2.907258987426758, + "logits/rejected": -2.794480323791504, + "logps/chosen": -85.34244537353516, + "logps/rejected": -773.3317260742188, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20540836453437805, + "rewards/margins": 7.082415580749512, + "rewards/rejected": -7.2878241539001465, + "step": 1370 + }, + { + "epoch": 0.08, + "learning_rate": 4.114490161001789e-06, + "logits/chosen": -2.8900818824768066, + "logits/rejected": -2.8205857276916504, + "logps/chosen": -112.50643157958984, + "logps/rejected": -770.963623046875, + "loss": 0.0303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44011083245277405, + "rewards/margins": 6.821206569671631, + "rewards/rejected": -7.261316776275635, + "step": 1380 + }, + { + "epoch": 0.08, + "learning_rate": 4.1443053070960046e-06, + "logits/chosen": -2.8994557857513428, + "logits/rejected": -2.8128392696380615, + "logps/chosen": -119.0394515991211, + "logps/rejected": -781.0758666992188, + "loss": 0.0232, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48783254623413086, + "rewards/margins": 6.872071743011475, + "rewards/rejected": -7.359903812408447, + "step": 1390 + }, + { + "epoch": 0.08, + "learning_rate": 4.174120453190221e-06, + "logits/chosen": -2.9006454944610596, + "logits/rejected": -2.8404648303985596, + "logps/chosen": -85.8052978515625, + "logps/rejected": -690.7034912109375, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17774322628974915, + "rewards/margins": 6.288405895233154, + "rewards/rejected": -6.466148376464844, + "step": 1400 + }, + { + "epoch": 0.08, + "learning_rate": 4.203935599284437e-06, + "logits/chosen": -2.9320194721221924, + "logits/rejected": -2.8243796825408936, + "logps/chosen": -136.13308715820312, + "logps/rejected": -770.6847534179688, + "loss": 0.0219, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.697273850440979, + "rewards/margins": 6.55682373046875, + "rewards/rejected": -7.254096984863281, + "step": 1410 + }, + { + "epoch": 0.08, + "learning_rate": 4.233750745378653e-06, + "logits/chosen": -2.917206287384033, + "logits/rejected": -2.795807123184204, + "logps/chosen": -149.92236328125, + "logps/rejected": -819.3771362304688, + "loss": 0.0277, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7758058309555054, + "rewards/margins": 6.977842807769775, + "rewards/rejected": -7.753647804260254, + "step": 1420 + }, + { + "epoch": 0.09, + "learning_rate": 4.263565891472868e-06, + "logits/chosen": -2.908297538757324, + "logits/rejected": -2.8062705993652344, + "logps/chosen": -100.38211059570312, + "logps/rejected": -749.2584838867188, + "loss": 0.0286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32293376326560974, + "rewards/margins": 6.71783971786499, + "rewards/rejected": -7.040773868560791, + "step": 1430 + }, + { + "epoch": 0.09, + "learning_rate": 4.293381037567084e-06, + "logits/chosen": -2.919139862060547, + "logits/rejected": -2.817054271697998, + "logps/chosen": -138.50921630859375, + "logps/rejected": -812.5184326171875, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6169862151145935, + "rewards/margins": 7.054887294769287, + "rewards/rejected": -7.671874046325684, + "step": 1440 + }, + { + "epoch": 0.09, + "learning_rate": 4.3231961836613e-06, + "logits/chosen": -2.8928654193878174, + "logits/rejected": -2.81543231010437, + "logps/chosen": -135.01760864257812, + "logps/rejected": -755.3966064453125, + "loss": 0.0346, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5882952809333801, + "rewards/margins": 6.516493320465088, + "rewards/rejected": -7.104788303375244, + "step": 1450 + }, + { + "epoch": 0.09, + "learning_rate": 4.353011329755516e-06, + "logits/chosen": -2.9313805103302, + "logits/rejected": -2.8082115650177, + "logps/chosen": -94.20889282226562, + "logps/rejected": -718.9976806640625, + "loss": 0.0219, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2442726343870163, + "rewards/margins": 6.521415710449219, + "rewards/rejected": -6.765688896179199, + "step": 1460 + }, + { + "epoch": 0.09, + "learning_rate": 4.382826475849732e-06, + "logits/chosen": -2.950016736984253, + "logits/rejected": -2.8385097980499268, + "logps/chosen": -95.37654113769531, + "logps/rejected": -746.2007446289062, + "loss": 0.0224, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21195659041404724, + "rewards/margins": 6.812326908111572, + "rewards/rejected": -7.024283409118652, + "step": 1470 + }, + { + "epoch": 0.09, + "learning_rate": 4.412641621943948e-06, + "logits/chosen": -2.909172773361206, + "logits/rejected": -2.8274030685424805, + "logps/chosen": -115.06422424316406, + "logps/rejected": -707.6343994140625, + "loss": 0.0406, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4683963358402252, + "rewards/margins": 6.17194128036499, + "rewards/rejected": -6.640337944030762, + "step": 1480 + }, + { + "epoch": 0.09, + "learning_rate": 4.442456768038164e-06, + "logits/chosen": -2.9105584621429443, + "logits/rejected": -2.8290228843688965, + "logps/chosen": -100.8877944946289, + "logps/rejected": -761.8911743164062, + "loss": 0.0353, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29536038637161255, + "rewards/margins": 6.875528812408447, + "rewards/rejected": -7.170888423919678, + "step": 1490 + }, + { + "epoch": 0.09, + "learning_rate": 4.472271914132379e-06, + "logits/chosen": -2.936131000518799, + "logits/rejected": -2.860999822616577, + "logps/chosen": -126.1532211303711, + "logps/rejected": -755.7017822265625, + "loss": 0.045, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5298954844474792, + "rewards/margins": 6.5799736976623535, + "rewards/rejected": -7.109869480133057, + "step": 1500 + }, + { + "epoch": 0.09, + "learning_rate": 4.502087060226595e-06, + "logits/chosen": -2.923811435699463, + "logits/rejected": -2.8051819801330566, + "logps/chosen": -126.59808349609375, + "logps/rejected": -834.5745239257812, + "loss": 0.0298, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5441330075263977, + "rewards/margins": 7.3602800369262695, + "rewards/rejected": -7.904412269592285, + "step": 1510 + }, + { + "epoch": 0.09, + "learning_rate": 4.5319022063208115e-06, + "logits/chosen": -2.9029436111450195, + "logits/rejected": -2.793008804321289, + "logps/chosen": -110.28570556640625, + "logps/rejected": -774.4368896484375, + "loss": 0.0238, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3923500180244446, + "rewards/margins": 6.911857604980469, + "rewards/rejected": -7.304207801818848, + "step": 1520 + }, + { + "epoch": 0.09, + "learning_rate": 4.561717352415027e-06, + "logits/chosen": -2.934405565261841, + "logits/rejected": -2.836610794067383, + "logps/chosen": -102.74562072753906, + "logps/rejected": -755.12646484375, + "loss": 0.0296, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28946709632873535, + "rewards/margins": 6.826741695404053, + "rewards/rejected": -7.116208553314209, + "step": 1530 + }, + { + "epoch": 0.09, + "learning_rate": 4.591532498509243e-06, + "logits/chosen": -2.924065351486206, + "logits/rejected": -2.847355365753174, + "logps/chosen": -92.72177124023438, + "logps/rejected": -743.5125122070312, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21991023421287537, + "rewards/margins": 6.7882537841796875, + "rewards/rejected": -7.008164882659912, + "step": 1540 + }, + { + "epoch": 0.09, + "learning_rate": 4.621347644603459e-06, + "logits/chosen": -2.929433822631836, + "logits/rejected": -2.852585554122925, + "logps/chosen": -81.76838684082031, + "logps/rejected": -776.5368041992188, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1782989203929901, + "rewards/margins": 7.1535186767578125, + "rewards/rejected": -7.33181619644165, + "step": 1550 + }, + { + "epoch": 0.09, + "learning_rate": 4.651162790697675e-06, + "logits/chosen": -2.8872294425964355, + "logits/rejected": -2.804154634475708, + "logps/chosen": -106.18524169921875, + "logps/rejected": -862.4391479492188, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40572699904441833, + "rewards/margins": 7.777975559234619, + "rewards/rejected": -8.18370246887207, + "step": 1560 + }, + { + "epoch": 0.09, + "learning_rate": 4.68097793679189e-06, + "logits/chosen": -2.916672945022583, + "logits/rejected": -2.8257951736450195, + "logps/chosen": -125.31349182128906, + "logps/rejected": -861.0426635742188, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.524355947971344, + "rewards/margins": 7.647345542907715, + "rewards/rejected": -8.171703338623047, + "step": 1570 + }, + { + "epoch": 0.09, + "learning_rate": 4.7107930828861065e-06, + "logits/chosen": -2.9099411964416504, + "logits/rejected": -2.829416036605835, + "logps/chosen": -82.06556701660156, + "logps/rejected": -796.6558227539062, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12387046962976456, + "rewards/margins": 7.402307033538818, + "rewards/rejected": -7.526177406311035, + "step": 1580 + }, + { + "epoch": 0.09, + "learning_rate": 4.740608228980323e-06, + "logits/chosen": -2.9172427654266357, + "logits/rejected": -2.79081392288208, + "logps/chosen": -98.95271301269531, + "logps/rejected": -797.7516479492188, + "loss": 0.0265, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25947898626327515, + "rewards/margins": 7.2698259353637695, + "rewards/rejected": -7.529304504394531, + "step": 1590 + }, + { + "epoch": 0.1, + "learning_rate": 4.770423375074538e-06, + "logits/chosen": -2.9234654903411865, + "logits/rejected": -2.826357126235962, + "logps/chosen": -69.31069946289062, + "logps/rejected": -835.509765625, + "loss": 0.0248, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00625277915969491, + "rewards/margins": 7.899542808532715, + "rewards/rejected": -7.905796051025391, + "step": 1600 + }, + { + "epoch": 0.1, + "learning_rate": 4.800238521168754e-06, + "logits/chosen": -2.9441840648651123, + "logits/rejected": -2.8595235347747803, + "logps/chosen": -77.70034790039062, + "logps/rejected": -752.5096435546875, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003330943640321493, + "rewards/margins": 7.0750555992126465, + "rewards/rejected": -7.078387260437012, + "step": 1610 + }, + { + "epoch": 0.1, + "learning_rate": 4.83005366726297e-06, + "logits/chosen": -2.9434502124786377, + "logits/rejected": -2.797577381134033, + "logps/chosen": -85.08970642089844, + "logps/rejected": -765.79345703125, + "loss": 0.0286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15180622041225433, + "rewards/margins": 7.069903373718262, + "rewards/rejected": -7.22170877456665, + "step": 1620 + }, + { + "epoch": 0.1, + "learning_rate": 4.859868813357186e-06, + "logits/chosen": -2.904468297958374, + "logits/rejected": -2.7911434173583984, + "logps/chosen": -98.65773010253906, + "logps/rejected": -755.7101440429688, + "loss": 0.0359, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2950167953968048, + "rewards/margins": 6.821331977844238, + "rewards/rejected": -7.1163482666015625, + "step": 1630 + }, + { + "epoch": 0.1, + "learning_rate": 4.8896839594514015e-06, + "logits/chosen": -2.9521899223327637, + "logits/rejected": -2.8706722259521484, + "logps/chosen": -75.16636657714844, + "logps/rejected": -748.5827026367188, + "loss": 0.0594, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.06359346210956573, + "rewards/margins": 6.986341953277588, + "rewards/rejected": -7.049934387207031, + "step": 1640 + }, + { + "epoch": 0.1, + "learning_rate": 4.919499105545618e-06, + "logits/chosen": -2.9369125366210938, + "logits/rejected": -2.833885431289673, + "logps/chosen": -92.69554901123047, + "logps/rejected": -756.6004638671875, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22647139430046082, + "rewards/margins": 6.897830009460449, + "rewards/rejected": -7.124301910400391, + "step": 1650 + }, + { + "epoch": 0.1, + "learning_rate": 4.949314251639834e-06, + "logits/chosen": -2.9366354942321777, + "logits/rejected": -2.8257880210876465, + "logps/chosen": -78.64491271972656, + "logps/rejected": -820.7420043945312, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06685201078653336, + "rewards/margins": 7.706122398376465, + "rewards/rejected": -7.772973537445068, + "step": 1660 + }, + { + "epoch": 0.1, + "learning_rate": 4.979129397734049e-06, + "logits/chosen": -2.905447483062744, + "logits/rejected": -2.829411745071411, + "logps/chosen": -105.19095611572266, + "logps/rejected": -778.275146484375, + "loss": 0.037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3830556273460388, + "rewards/margins": 6.958500862121582, + "rewards/rejected": -7.341555595397949, + "step": 1670 + }, + { + "epoch": 0.1, + "learning_rate": 4.99999951258251e-06, + "logits/chosen": -2.930213451385498, + "logits/rejected": -2.881126880645752, + "logps/chosen": -93.16279602050781, + "logps/rejected": -832.3576049804688, + "loss": 0.0355, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22098806500434875, + "rewards/margins": 7.65737771987915, + "rewards/rejected": -7.878365993499756, + "step": 1680 + }, + { + "epoch": 0.1, + "learning_rate": 4.9999908473879605e-06, + "logits/chosen": -2.923370599746704, + "logits/rejected": -2.83423113822937, + "logps/chosen": -87.92120361328125, + "logps/rejected": -878.4290161132812, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18133951723575592, + "rewards/margins": 8.165082931518555, + "rewards/rejected": -8.346423149108887, + "step": 1690 + }, + { + "epoch": 0.1, + "learning_rate": 4.999971350736829e-06, + "logits/chosen": -2.905275583267212, + "logits/rejected": -2.8191068172454834, + "logps/chosen": -128.41648864746094, + "logps/rejected": -831.4952392578125, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5878888368606567, + "rewards/margins": 7.279034614562988, + "rewards/rejected": -7.866921901702881, + "step": 1700 + }, + { + "epoch": 0.1, + "learning_rate": 4.999941022713586e-06, + "logits/chosen": -2.909315586090088, + "logits/rejected": -2.8059866428375244, + "logps/chosen": -91.888671875, + "logps/rejected": -915.4306640625, + "loss": 0.0247, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2655971348285675, + "rewards/margins": 8.438061714172363, + "rewards/rejected": -8.703659057617188, + "step": 1710 + }, + { + "epoch": 0.1, + "learning_rate": 4.999899863449631e-06, + "logits/chosen": -2.9266390800476074, + "logits/rejected": -2.819002866744995, + "logps/chosen": -105.3461685180664, + "logps/rejected": -842.1634521484375, + "loss": 0.0392, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.33604034781455994, + "rewards/margins": 7.634359836578369, + "rewards/rejected": -7.970399379730225, + "step": 1720 + }, + { + "epoch": 0.1, + "learning_rate": 4.999847873123291e-06, + "logits/chosen": -2.945405960083008, + "logits/rejected": -2.839404582977295, + "logps/chosen": -77.4031753540039, + "logps/rejected": -859.71923828125, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07228974997997284, + "rewards/margins": 8.0891752243042, + "rewards/rejected": -8.16146469116211, + "step": 1730 + }, + { + "epoch": 0.1, + "learning_rate": 4.999785051959819e-06, + "logits/chosen": -2.927288770675659, + "logits/rejected": -2.8343400955200195, + "logps/chosen": -80.38436126708984, + "logps/rejected": -812.8990478515625, + "loss": 0.0268, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05866802856326103, + "rewards/margins": 7.646407127380371, + "rewards/rejected": -7.705076694488525, + "step": 1740 + }, + { + "epoch": 0.1, + "learning_rate": 4.999711400231393e-06, + "logits/chosen": -2.949894666671753, + "logits/rejected": -2.851357936859131, + "logps/chosen": -72.51522064208984, + "logps/rejected": -967.3624267578125, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01461850292980671, + "rewards/margins": 9.207165718078613, + "rewards/rejected": -9.221784591674805, + "step": 1750 + }, + { + "epoch": 0.1, + "learning_rate": 4.999626918257117e-06, + "logits/chosen": -2.9162895679473877, + "logits/rejected": -2.824445962905884, + "logps/chosen": -91.44669342041016, + "logps/rejected": -881.5900268554688, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25392740964889526, + "rewards/margins": 8.122617721557617, + "rewards/rejected": -8.376545906066895, + "step": 1760 + }, + { + "epoch": 0.11, + "learning_rate": 4.999531606403018e-06, + "logits/chosen": -2.8978888988494873, + "logits/rejected": -2.8125948905944824, + "logps/chosen": -91.75733947753906, + "logps/rejected": -820.8654174804688, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2465338408946991, + "rewards/margins": 7.528445243835449, + "rewards/rejected": -7.774979591369629, + "step": 1770 + }, + { + "epoch": 0.11, + "learning_rate": 4.999425465082043e-06, + "logits/chosen": -2.893117904663086, + "logits/rejected": -2.79646897315979, + "logps/chosen": -111.27781677246094, + "logps/rejected": -860.5802612304688, + "loss": 0.0253, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3445555567741394, + "rewards/margins": 7.822633266448975, + "rewards/rejected": -8.16718864440918, + "step": 1780 + }, + { + "epoch": 0.11, + "learning_rate": 4.99930849475406e-06, + "logits/chosen": -2.9206490516662598, + "logits/rejected": -2.8516833782196045, + "logps/chosen": -78.2650146484375, + "logps/rejected": -941.5294799804688, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08792293071746826, + "rewards/margins": 8.884671211242676, + "rewards/rejected": -8.972593307495117, + "step": 1790 + }, + { + "epoch": 0.11, + "learning_rate": 4.999180695925856e-06, + "logits/chosen": -2.953122138977051, + "logits/rejected": -2.8829965591430664, + "logps/chosen": -85.0573501586914, + "logps/rejected": -814.1541137695312, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16442376375198364, + "rewards/margins": 7.528973579406738, + "rewards/rejected": -7.693397521972656, + "step": 1800 + }, + { + "epoch": 0.11, + "learning_rate": 4.999042069151129e-06, + "logits/chosen": -2.916384220123291, + "logits/rejected": -2.8653922080993652, + "logps/chosen": -69.55900573730469, + "logps/rejected": -865.80322265625, + "loss": 0.0266, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04656394198536873, + "rewards/margins": 8.165156364440918, + "rewards/rejected": -8.21172046661377, + "step": 1810 + }, + { + "epoch": 0.11, + "learning_rate": 4.998892615030496e-06, + "logits/chosen": -2.9309935569763184, + "logits/rejected": -2.868135929107666, + "logps/chosen": -79.2657241821289, + "logps/rejected": -772.4255981445312, + "loss": 0.0226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10171637684106827, + "rewards/margins": 7.189272403717041, + "rewards/rejected": -7.290989875793457, + "step": 1820 + }, + { + "epoch": 0.11, + "learning_rate": 4.99873233421148e-06, + "logits/chosen": -2.9083075523376465, + "logits/rejected": -2.7826335430145264, + "logps/chosen": -97.53350830078125, + "logps/rejected": -898.1552734375, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2526090145111084, + "rewards/margins": 8.289029121398926, + "rewards/rejected": -8.541638374328613, + "step": 1830 + }, + { + "epoch": 0.11, + "learning_rate": 4.9985612273885145e-06, + "logits/chosen": -2.937445640563965, + "logits/rejected": -2.850409984588623, + "logps/chosen": -88.12530517578125, + "logps/rejected": -842.4501953125, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15504731237888336, + "rewards/margins": 7.8274126052856445, + "rewards/rejected": -7.98245906829834, + "step": 1840 + }, + { + "epoch": 0.11, + "learning_rate": 4.998379295302936e-06, + "logits/chosen": -2.898979663848877, + "logits/rejected": -2.8027191162109375, + "logps/chosen": -75.83100891113281, + "logps/rejected": -893.0339965820312, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09177707880735397, + "rewards/margins": 8.395233154296875, + "rewards/rejected": -8.48701000213623, + "step": 1850 + }, + { + "epoch": 0.11, + "learning_rate": 4.9981865387429825e-06, + "logits/chosen": -2.9434962272644043, + "logits/rejected": -2.8376216888427734, + "logps/chosen": -94.00227355957031, + "logps/rejected": -738.0641479492188, + "loss": 0.0366, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24040238559246063, + "rewards/margins": 6.678671360015869, + "rewards/rejected": -6.91907262802124, + "step": 1860 + }, + { + "epoch": 0.11, + "learning_rate": 4.997982958543792e-06, + "logits/chosen": -2.910454750061035, + "logits/rejected": -2.7979977130889893, + "logps/chosen": -90.28697204589844, + "logps/rejected": -1013.6444091796875, + "loss": 0.044, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.17328914999961853, + "rewards/margins": 9.517983436584473, + "rewards/rejected": -9.691271781921387, + "step": 1870 + }, + { + "epoch": 0.11, + "learning_rate": 4.9977685555873955e-06, + "logits/chosen": -2.94758939743042, + "logits/rejected": -2.834099054336548, + "logps/chosen": -104.4254379272461, + "logps/rejected": -839.0380859375, + "loss": 0.0295, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3154955804347992, + "rewards/margins": 7.631292819976807, + "rewards/rejected": -7.946788787841797, + "step": 1880 + }, + { + "epoch": 0.11, + "learning_rate": 4.997543330802716e-06, + "logits/chosen": -2.9423186779022217, + "logits/rejected": -2.8269755840301514, + "logps/chosen": -143.88519287109375, + "logps/rejected": -954.8709106445312, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7201865911483765, + "rewards/margins": 8.379110336303711, + "rewards/rejected": -9.099296569824219, + "step": 1890 + }, + { + "epoch": 0.11, + "learning_rate": 4.997307285165559e-06, + "logits/chosen": -2.900526285171509, + "logits/rejected": -2.7858831882476807, + "logps/chosen": -165.40724182128906, + "logps/rejected": -843.07958984375, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9333111047744751, + "rewards/margins": 7.0505547523498535, + "rewards/rejected": -7.983866214752197, + "step": 1900 + }, + { + "epoch": 0.11, + "learning_rate": 4.997060419698618e-06, + "logits/chosen": -2.90938663482666, + "logits/rejected": -2.806380033493042, + "logps/chosen": -125.01261138916016, + "logps/rejected": -848.7537841796875, + "loss": 0.0204, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5573270916938782, + "rewards/margins": 7.487240791320801, + "rewards/rejected": -8.044568061828613, + "step": 1910 + }, + { + "epoch": 0.11, + "learning_rate": 4.996802735471461e-06, + "logits/chosen": -2.92683744430542, + "logits/rejected": -2.8211593627929688, + "logps/chosen": -86.11729431152344, + "logps/rejected": -835.4078369140625, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16640424728393555, + "rewards/margins": 7.743669033050537, + "rewards/rejected": -7.910073757171631, + "step": 1920 + }, + { + "epoch": 0.12, + "learning_rate": 4.996534233600531e-06, + "logits/chosen": -2.905492067337036, + "logits/rejected": -2.7942118644714355, + "logps/chosen": -81.23568725585938, + "logps/rejected": -751.8267822265625, + "loss": 0.0303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09233834594488144, + "rewards/margins": 6.9861955642700195, + "rewards/rejected": -7.078534126281738, + "step": 1930 + }, + { + "epoch": 0.12, + "learning_rate": 4.996254915249138e-06, + "logits/chosen": -2.937582015991211, + "logits/rejected": -2.819331407546997, + "logps/chosen": -108.2693862915039, + "logps/rejected": -898.1185302734375, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40084370970726013, + "rewards/margins": 8.131311416625977, + "rewards/rejected": -8.53215503692627, + "step": 1940 + }, + { + "epoch": 0.12, + "learning_rate": 4.995964781627457e-06, + "logits/chosen": -2.937994956970215, + "logits/rejected": -2.84669828414917, + "logps/chosen": -94.46736907958984, + "logps/rejected": -935.9000854492188, + "loss": 0.0204, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2815585732460022, + "rewards/margins": 8.637256622314453, + "rewards/rejected": -8.918814659118652, + "step": 1950 + }, + { + "epoch": 0.12, + "learning_rate": 4.99566383399252e-06, + "logits/chosen": -2.9156644344329834, + "logits/rejected": -2.817598819732666, + "logps/chosen": -91.17527770996094, + "logps/rejected": -878.8995361328125, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.198879212141037, + "rewards/margins": 8.136736869812012, + "rewards/rejected": -8.335616111755371, + "step": 1960 + }, + { + "epoch": 0.12, + "learning_rate": 4.995352073648213e-06, + "logits/chosen": -2.9092535972595215, + "logits/rejected": -2.8243935108184814, + "logps/chosen": -107.5519027709961, + "logps/rejected": -924.94189453125, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3393505811691284, + "rewards/margins": 8.469963073730469, + "rewards/rejected": -8.80931282043457, + "step": 1970 + }, + { + "epoch": 0.12, + "learning_rate": 4.9950295019452665e-06, + "logits/chosen": -2.92537260055542, + "logits/rejected": -2.8429553508758545, + "logps/chosen": -77.56587219238281, + "logps/rejected": -926.5347900390625, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09293019771575928, + "rewards/margins": 8.726125717163086, + "rewards/rejected": -8.819055557250977, + "step": 1980 + }, + { + "epoch": 0.12, + "learning_rate": 4.9946961202812566e-06, + "logits/chosen": -2.9121599197387695, + "logits/rejected": -2.8202285766601562, + "logps/chosen": -101.99006652832031, + "logps/rejected": -943.1390380859375, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2530006468296051, + "rewards/margins": 8.725936889648438, + "rewards/rejected": -8.978937149047852, + "step": 1990 + }, + { + "epoch": 0.12, + "learning_rate": 4.99435193010059e-06, + "logits/chosen": -2.902696371078491, + "logits/rejected": -2.8028347492218018, + "logps/chosen": -80.13379669189453, + "logps/rejected": -770.0050048828125, + "loss": 0.0287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1043882742524147, + "rewards/margins": 7.15741491317749, + "rewards/rejected": -7.26180362701416, + "step": 2000 + }, + { + "epoch": 0.12, + "learning_rate": 4.993996932894507e-06, + "logits/chosen": -2.9226884841918945, + "logits/rejected": -2.803157091140747, + "logps/chosen": -73.43228912353516, + "logps/rejected": -900.3966064453125, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007940527983009815, + "rewards/margins": 8.555150032043457, + "rewards/rejected": -8.563089370727539, + "step": 2010 + }, + { + "epoch": 0.12, + "learning_rate": 4.993631130201066e-06, + "logits/chosen": -2.9101755619049072, + "logits/rejected": -2.819037914276123, + "logps/chosen": -75.83551025390625, + "logps/rejected": -907.328125, + "loss": 0.038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.052049994468688965, + "rewards/margins": 8.569490432739258, + "rewards/rejected": -8.621540069580078, + "step": 2020 + }, + { + "epoch": 0.12, + "learning_rate": 4.993254523605144e-06, + "logits/chosen": -2.9077773094177246, + "logits/rejected": -2.8096206188201904, + "logps/chosen": -71.08650970458984, + "logps/rejected": -894.9310302734375, + "loss": 0.021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03352269157767296, + "rewards/margins": 8.482320785522461, + "rewards/rejected": -8.515843391418457, + "step": 2030 + }, + { + "epoch": 0.12, + "learning_rate": 4.9928671147384255e-06, + "logits/chosen": -2.8848633766174316, + "logits/rejected": -2.776020050048828, + "logps/chosen": -70.45372009277344, + "logps/rejected": -960.6707763671875, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011269062757492065, + "rewards/margins": 9.143277168273926, + "rewards/rejected": -9.154546737670898, + "step": 2040 + }, + { + "epoch": 0.12, + "learning_rate": 4.992468905279398e-06, + "logits/chosen": -2.867745876312256, + "logits/rejected": -2.7999446392059326, + "logps/chosen": -75.90371704101562, + "logps/rejected": -925.2644653320312, + "loss": 0.0301, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05030970647931099, + "rewards/margins": 8.771891593933105, + "rewards/rejected": -8.822200775146484, + "step": 2050 + }, + { + "epoch": 0.12, + "learning_rate": 4.992059896953343e-06, + "logits/chosen": -2.921201229095459, + "logits/rejected": -2.8354499340057373, + "logps/chosen": -77.17427062988281, + "logps/rejected": -842.29296875, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09570114314556122, + "rewards/margins": 7.881175994873047, + "rewards/rejected": -7.9768781661987305, + "step": 2060 + }, + { + "epoch": 0.12, + "learning_rate": 4.99164009153233e-06, + "logits/chosen": -2.9181408882141113, + "logits/rejected": -2.828521966934204, + "logps/chosen": -105.7486801147461, + "logps/rejected": -875.3050537109375, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35077470541000366, + "rewards/margins": 7.954843997955322, + "rewards/rejected": -8.305618286132812, + "step": 2070 + }, + { + "epoch": 0.12, + "learning_rate": 4.991209490835207e-06, + "logits/chosen": -2.9404006004333496, + "logits/rejected": -2.83046817779541, + "logps/chosen": -126.86375427246094, + "logps/rejected": -844.8680419921875, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5857782959938049, + "rewards/margins": 7.417706489562988, + "rewards/rejected": -8.003484725952148, + "step": 2080 + }, + { + "epoch": 0.12, + "learning_rate": 4.990768096727594e-06, + "logits/chosen": -2.9306182861328125, + "logits/rejected": -2.814241409301758, + "logps/chosen": -103.2199478149414, + "logps/rejected": -901.07421875, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2994753420352936, + "rewards/margins": 8.261815071105957, + "rewards/rejected": -8.561290740966797, + "step": 2090 + }, + { + "epoch": 0.13, + "learning_rate": 4.990315911121874e-06, + "logits/chosen": -2.9294815063476562, + "logits/rejected": -2.8447818756103516, + "logps/chosen": -73.66078186035156, + "logps/rejected": -934.5150146484375, + "loss": 0.0246, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07724453508853912, + "rewards/margins": 8.815179824829102, + "rewards/rejected": -8.892423629760742, + "step": 2100 + }, + { + "epoch": 0.13, + "learning_rate": 4.989852935977187e-06, + "logits/chosen": -2.903233766555786, + "logits/rejected": -2.8111109733581543, + "logps/chosen": -71.41477966308594, + "logps/rejected": -884.0208740234375, + "loss": 0.0334, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.02149464190006256, + "rewards/margins": 8.372645378112793, + "rewards/rejected": -8.394139289855957, + "step": 2110 + }, + { + "epoch": 0.13, + "learning_rate": 4.989379173299416e-06, + "logits/chosen": -2.946033477783203, + "logits/rejected": -2.8650143146514893, + "logps/chosen": -65.48561096191406, + "logps/rejected": -891.6853637695312, + "loss": 0.024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0060962652787566185, + "rewards/margins": 8.4716215133667, + "rewards/rejected": -8.465524673461914, + "step": 2120 + }, + { + "epoch": 0.13, + "learning_rate": 4.988894625141186e-06, + "logits/chosen": -2.9310858249664307, + "logits/rejected": -2.846900701522827, + "logps/chosen": -96.44914245605469, + "logps/rejected": -933.12548828125, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24209725856781006, + "rewards/margins": 8.64737319946289, + "rewards/rejected": -8.889471054077148, + "step": 2130 + }, + { + "epoch": 0.13, + "learning_rate": 4.98839929360185e-06, + "logits/chosen": -2.91212797164917, + "logits/rejected": -2.789159059524536, + "logps/chosen": -123.30867004394531, + "logps/rejected": -930.87060546875, + "loss": 0.0256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49023550748825073, + "rewards/margins": 8.366610527038574, + "rewards/rejected": -8.856844902038574, + "step": 2140 + }, + { + "epoch": 0.13, + "learning_rate": 4.9878931808274796e-06, + "logits/chosen": -2.914276599884033, + "logits/rejected": -2.8237998485565186, + "logps/chosen": -82.78339385986328, + "logps/rejected": -874.6910400390625, + "loss": 0.0266, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2022639811038971, + "rewards/margins": 8.109360694885254, + "rewards/rejected": -8.311624526977539, + "step": 2150 + }, + { + "epoch": 0.13, + "learning_rate": 4.9873762890108596e-06, + "logits/chosen": -2.940214157104492, + "logits/rejected": -2.8720109462738037, + "logps/chosen": -75.28640747070312, + "logps/rejected": -923.8096923828125, + "loss": 0.0228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03417595475912094, + "rewards/margins": 8.761570930480957, + "rewards/rejected": -8.795747756958008, + "step": 2160 + }, + { + "epoch": 0.13, + "learning_rate": 4.986848620391473e-06, + "logits/chosen": -2.900097608566284, + "logits/rejected": -2.8351101875305176, + "logps/chosen": -127.07694244384766, + "logps/rejected": -902.7859497070312, + "loss": 0.0254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5734145045280457, + "rewards/margins": 8.012839317321777, + "rewards/rejected": -8.586254119873047, + "step": 2170 + }, + { + "epoch": 0.13, + "learning_rate": 4.986310177255498e-06, + "logits/chosen": -2.905867099761963, + "logits/rejected": -2.8195083141326904, + "logps/chosen": -134.7004852294922, + "logps/rejected": -868.62939453125, + "loss": 0.0614, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6421254277229309, + "rewards/margins": 7.5909247398376465, + "rewards/rejected": -8.233050346374512, + "step": 2180 + }, + { + "epoch": 0.13, + "learning_rate": 4.985760961935791e-06, + "logits/chosen": -2.9291980266571045, + "logits/rejected": -2.864793539047241, + "logps/chosen": -71.02049255371094, + "logps/rejected": -910.2662963867188, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005708605982363224, + "rewards/margins": 8.673110008239746, + "rewards/rejected": -8.667402267456055, + "step": 2190 + }, + { + "epoch": 0.13, + "learning_rate": 4.985200976811882e-06, + "logits/chosen": -2.917705774307251, + "logits/rejected": -2.8427376747131348, + "logps/chosen": -63.59644317626953, + "logps/rejected": -901.7972412109375, + "loss": 0.0196, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.050329696387052536, + "rewards/margins": 8.622421264648438, + "rewards/rejected": -8.572092056274414, + "step": 2200 + }, + { + "epoch": 0.13, + "learning_rate": 4.9846302243099624e-06, + "logits/chosen": -2.913543224334717, + "logits/rejected": -2.846938371658325, + "logps/chosen": -79.2912826538086, + "logps/rejected": -919.1541748046875, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08877657353878021, + "rewards/margins": 8.645597457885742, + "rewards/rejected": -8.734375, + "step": 2210 + }, + { + "epoch": 0.13, + "learning_rate": 4.984048706902872e-06, + "logits/chosen": -2.9400038719177246, + "logits/rejected": -2.815504789352417, + "logps/chosen": -88.97303771972656, + "logps/rejected": -924.6090698242188, + "loss": 0.0329, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2140566110610962, + "rewards/margins": 8.580434799194336, + "rewards/rejected": -8.794490814208984, + "step": 2220 + }, + { + "epoch": 0.13, + "learning_rate": 4.9834564271100925e-06, + "logits/chosen": -2.940690755844116, + "logits/rejected": -2.86393666267395, + "logps/chosen": -78.72700500488281, + "logps/rejected": -940.3984375, + "loss": 0.0254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11535857617855072, + "rewards/margins": 8.834736824035645, + "rewards/rejected": -8.950096130371094, + "step": 2230 + }, + { + "epoch": 0.13, + "learning_rate": 4.982853387497737e-06, + "logits/chosen": -2.916646957397461, + "logits/rejected": -2.8344645500183105, + "logps/chosen": -83.87245178222656, + "logps/rejected": -892.7063598632812, + "loss": 0.0237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13665243983268738, + "rewards/margins": 8.350369453430176, + "rewards/rejected": -8.487020492553711, + "step": 2240 + }, + { + "epoch": 0.13, + "learning_rate": 4.98223959067853e-06, + "logits/chosen": -2.8973255157470703, + "logits/rejected": -2.8041720390319824, + "logps/chosen": -108.44938659667969, + "logps/rejected": -1021.1028442382812, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3960796594619751, + "rewards/margins": 9.363378524780273, + "rewards/rejected": -9.759458541870117, + "step": 2250 + }, + { + "epoch": 0.13, + "learning_rate": 4.9816150393118105e-06, + "logits/chosen": -2.9236443042755127, + "logits/rejected": -2.8413333892822266, + "logps/chosen": -95.8695297241211, + "logps/rejected": -860.6937255859375, + "loss": 0.0256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2797923982143402, + "rewards/margins": 7.885159969329834, + "rewards/rejected": -8.164952278137207, + "step": 2260 + }, + { + "epoch": 0.14, + "learning_rate": 4.980979736103506e-06, + "logits/chosen": -2.913989543914795, + "logits/rejected": -2.8314120769500732, + "logps/chosen": -82.67513275146484, + "logps/rejected": -938.8826293945312, + "loss": 0.0241, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10101115703582764, + "rewards/margins": 8.846435546875, + "rewards/rejected": -8.947446823120117, + "step": 2270 + }, + { + "epoch": 0.14, + "learning_rate": 4.980333683806132e-06, + "logits/chosen": -2.941417932510376, + "logits/rejected": -2.81229829788208, + "logps/chosen": -83.7132568359375, + "logps/rejected": -839.8123168945312, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10666797310113907, + "rewards/margins": 7.8471856117248535, + "rewards/rejected": -7.953853607177734, + "step": 2280 + }, + { + "epoch": 0.14, + "learning_rate": 4.979676885218772e-06, + "logits/chosen": -2.8838696479797363, + "logits/rejected": -2.7944722175598145, + "logps/chosen": -75.11915588378906, + "logps/rejected": -887.5614013671875, + "loss": 0.0399, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0021694537717849016, + "rewards/margins": 8.430620193481445, + "rewards/rejected": -8.432788848876953, + "step": 2290 + }, + { + "epoch": 0.14, + "learning_rate": 4.979009343187073e-06, + "logits/chosen": -2.9060213565826416, + "logits/rejected": -2.8375351428985596, + "logps/chosen": -71.66847229003906, + "logps/rejected": -880.8853759765625, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01039136666804552, + "rewards/margins": 8.36555290222168, + "rewards/rejected": -8.375945091247559, + "step": 2300 + }, + { + "epoch": 0.14, + "learning_rate": 4.9783310606032245e-06, + "logits/chosen": -2.9355666637420654, + "logits/rejected": -2.856356143951416, + "logps/chosen": -70.37708282470703, + "logps/rejected": -908.4384765625, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026617299765348434, + "rewards/margins": 8.613717079162598, + "rewards/rejected": -8.640335083007812, + "step": 2310 + }, + { + "epoch": 0.14, + "learning_rate": 4.977642040405954e-06, + "logits/chosen": -2.8900671005249023, + "logits/rejected": -2.8329005241394043, + "logps/chosen": -61.27196502685547, + "logps/rejected": -914.7413330078125, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07489734143018723, + "rewards/margins": 8.778996467590332, + "rewards/rejected": -8.70409870147705, + "step": 2320 + }, + { + "epoch": 0.14, + "learning_rate": 4.976942285580507e-06, + "logits/chosen": -2.9066126346588135, + "logits/rejected": -2.8480751514434814, + "logps/chosen": -77.35345458984375, + "logps/rejected": -772.4773559570312, + "loss": 0.0254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07964875549077988, + "rewards/margins": 7.202715873718262, + "rewards/rejected": -7.282364845275879, + "step": 2330 + }, + { + "epoch": 0.14, + "learning_rate": 4.976231799158643e-06, + "logits/chosen": -2.9451980590820312, + "logits/rejected": -2.8087496757507324, + "logps/chosen": -102.09986114501953, + "logps/rejected": -941.6900634765625, + "loss": 0.0234, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34645453095436096, + "rewards/margins": 8.632307052612305, + "rewards/rejected": -8.978763580322266, + "step": 2340 + }, + { + "epoch": 0.14, + "learning_rate": 4.975510584218614e-06, + "logits/chosen": -2.9150779247283936, + "logits/rejected": -2.8100497722625732, + "logps/chosen": -127.3231430053711, + "logps/rejected": -917.0535888671875, + "loss": 0.0271, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.602421224117279, + "rewards/margins": 8.135290145874023, + "rewards/rejected": -8.737710952758789, + "step": 2350 + }, + { + "epoch": 0.14, + "learning_rate": 4.974778643885153e-06, + "logits/chosen": -2.89973783493042, + "logits/rejected": -2.818477153778076, + "logps/chosen": -113.25349426269531, + "logps/rejected": -923.7726440429688, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4272824823856354, + "rewards/margins": 8.35981559753418, + "rewards/rejected": -8.787099838256836, + "step": 2360 + }, + { + "epoch": 0.14, + "learning_rate": 4.974035981329465e-06, + "logits/chosen": -2.9361205101013184, + "logits/rejected": -2.8323092460632324, + "logps/chosen": -103.0582275390625, + "logps/rejected": -849.3351440429688, + "loss": 0.0491, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.29984036087989807, + "rewards/margins": 7.7639617919921875, + "rewards/rejected": -8.063802719116211, + "step": 2370 + }, + { + "epoch": 0.14, + "learning_rate": 4.973282599769207e-06, + "logits/chosen": -2.898646354675293, + "logits/rejected": -2.789358615875244, + "logps/chosen": -100.55979919433594, + "logps/rejected": -942.27734375, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33725738525390625, + "rewards/margins": 8.635374069213867, + "rewards/rejected": -8.972631454467773, + "step": 2380 + }, + { + "epoch": 0.14, + "learning_rate": 4.972518502468482e-06, + "logits/chosen": -2.89788556098938, + "logits/rejected": -2.824398994445801, + "logps/chosen": -119.30379486083984, + "logps/rejected": -849.9752197265625, + "loss": 0.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.519378662109375, + "rewards/margins": 7.53745174407959, + "rewards/rejected": -8.056829452514648, + "step": 2390 + }, + { + "epoch": 0.14, + "learning_rate": 4.971743692737814e-06, + "logits/chosen": -2.9084324836730957, + "logits/rejected": -2.786465883255005, + "logps/chosen": -135.71572875976562, + "logps/rejected": -896.41455078125, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6771860122680664, + "rewards/margins": 7.848405361175537, + "rewards/rejected": -8.525590896606445, + "step": 2400 + }, + { + "epoch": 0.14, + "learning_rate": 4.970958173934144e-06, + "logits/chosen": -2.9394097328186035, + "logits/rejected": -2.847304582595825, + "logps/chosen": -116.1565933227539, + "logps/rejected": -964.9874267578125, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42276591062545776, + "rewards/margins": 8.783390045166016, + "rewards/rejected": -9.206155776977539, + "step": 2410 + }, + { + "epoch": 0.14, + "learning_rate": 4.970161949460808e-06, + "logits/chosen": -2.910912036895752, + "logits/rejected": -2.8202598094940186, + "logps/chosen": -129.80703735351562, + "logps/rejected": -878.2278442382812, + "loss": 0.0205, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5632044076919556, + "rewards/margins": 7.766985893249512, + "rewards/rejected": -8.33018970489502, + "step": 2420 + }, + { + "epoch": 0.14, + "learning_rate": 4.969355022767529e-06, + "logits/chosen": -2.9213080406188965, + "logits/rejected": -2.821371555328369, + "logps/chosen": -147.2045440673828, + "logps/rejected": -948.5740356445312, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7376371622085571, + "rewards/margins": 8.297497749328613, + "rewards/rejected": -9.035135269165039, + "step": 2430 + }, + { + "epoch": 0.15, + "learning_rate": 4.968537397350395e-06, + "logits/chosen": -2.93369460105896, + "logits/rejected": -2.8000893592834473, + "logps/chosen": -118.7147216796875, + "logps/rejected": -1059.5216064453125, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4695959687232971, + "rewards/margins": 9.663223266601562, + "rewards/rejected": -10.132821083068848, + "step": 2440 + }, + { + "epoch": 0.15, + "learning_rate": 4.967709076751848e-06, + "logits/chosen": -2.9022932052612305, + "logits/rejected": -2.7864482402801514, + "logps/chosen": -106.00236511230469, + "logps/rejected": -884.3004150390625, + "loss": 0.0258, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33720219135284424, + "rewards/margins": 8.058537483215332, + "rewards/rejected": -8.395739555358887, + "step": 2450 + }, + { + "epoch": 0.15, + "learning_rate": 4.96687006456067e-06, + "logits/chosen": -2.9117045402526855, + "logits/rejected": -2.826190948486328, + "logps/chosen": -85.51109313964844, + "logps/rejected": -891.7425537109375, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12658901512622833, + "rewards/margins": 8.349087715148926, + "rewards/rejected": -8.475676536560059, + "step": 2460 + }, + { + "epoch": 0.15, + "learning_rate": 4.966020364411964e-06, + "logits/chosen": -2.931962013244629, + "logits/rejected": -2.825965404510498, + "logps/chosen": -99.91218566894531, + "logps/rejected": -877.6570434570312, + "loss": 0.0288, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.24606366455554962, + "rewards/margins": 8.083056449890137, + "rewards/rejected": -8.329119682312012, + "step": 2470 + }, + { + "epoch": 0.15, + "learning_rate": 4.965159979987139e-06, + "logits/chosen": -2.9141736030578613, + "logits/rejected": -2.8199572563171387, + "logps/chosen": -110.23834228515625, + "logps/rejected": -953.1521606445312, + "loss": 0.0209, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3438529968261719, + "rewards/margins": 8.732954025268555, + "rewards/rejected": -9.076807975769043, + "step": 2480 + }, + { + "epoch": 0.15, + "learning_rate": 4.964288915013895e-06, + "logits/chosen": -2.9574790000915527, + "logits/rejected": -2.8234541416168213, + "logps/chosen": -80.96260070800781, + "logps/rejected": -978.5983276367188, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10227058082818985, + "rewards/margins": 9.238229751586914, + "rewards/rejected": -9.340500831604004, + "step": 2490 + }, + { + "epoch": 0.15, + "learning_rate": 4.963407173266208e-06, + "logits/chosen": -2.9136593341827393, + "logits/rejected": -2.807762861251831, + "logps/chosen": -77.60491943359375, + "logps/rejected": -871.3599853515625, + "loss": 0.0291, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09069880098104477, + "rewards/margins": 8.173863410949707, + "rewards/rejected": -8.264561653137207, + "step": 2500 + }, + { + "epoch": 0.15, + "learning_rate": 4.962514758564309e-06, + "logits/chosen": -2.8861048221588135, + "logits/rejected": -2.8033528327941895, + "logps/chosen": -61.968505859375, + "logps/rejected": -1041.1016845703125, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05064171552658081, + "rewards/margins": 10.013525009155273, + "rewards/rejected": -9.962882041931152, + "step": 2510 + }, + { + "epoch": 0.15, + "learning_rate": 4.961611674774674e-06, + "logits/chosen": -2.926732301712036, + "logits/rejected": -2.827977418899536, + "logps/chosen": -71.05899047851562, + "logps/rejected": -848.5643310546875, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04070550575852394, + "rewards/margins": 8.001574516296387, + "rewards/rejected": -8.042280197143555, + "step": 2520 + }, + { + "epoch": 0.15, + "learning_rate": 4.960697925810003e-06, + "logits/chosen": -2.9085631370544434, + "logits/rejected": -2.8304319381713867, + "logps/chosen": -92.6507797241211, + "logps/rejected": -1018.9605712890625, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21855959296226501, + "rewards/margins": 9.519502639770508, + "rewards/rejected": -9.738062858581543, + "step": 2530 + }, + { + "epoch": 0.15, + "learning_rate": 4.9597735156292024e-06, + "logits/chosen": -2.9259884357452393, + "logits/rejected": -2.8063361644744873, + "logps/chosen": -86.19991302490234, + "logps/rejected": -892.0320434570312, + "loss": 0.0308, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.11056070029735565, + "rewards/margins": 8.37292766571045, + "rewards/rejected": -8.483488082885742, + "step": 2540 + }, + { + "epoch": 0.15, + "learning_rate": 4.9588384482373695e-06, + "logits/chosen": -2.884809970855713, + "logits/rejected": -2.8330957889556885, + "logps/chosen": -98.22537231445312, + "logps/rejected": -938.7869873046875, + "loss": 0.0597, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2814822196960449, + "rewards/margins": 8.667036056518555, + "rewards/rejected": -8.948518753051758, + "step": 2550 + }, + { + "epoch": 0.15, + "learning_rate": 4.957892727685778e-06, + "logits/chosen": -2.9501452445983887, + "logits/rejected": -2.847529649734497, + "logps/chosen": -107.39640808105469, + "logps/rejected": -986.8916015625, + "loss": 0.029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34068137407302856, + "rewards/margins": 9.089051246643066, + "rewards/rejected": -9.429732322692871, + "step": 2560 + }, + { + "epoch": 0.15, + "learning_rate": 4.956936358071853e-06, + "logits/chosen": -2.9261088371276855, + "logits/rejected": -2.828829765319824, + "logps/chosen": -119.02828216552734, + "logps/rejected": -933.5569458007812, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4049844741821289, + "rewards/margins": 8.486495018005371, + "rewards/rejected": -8.8914794921875, + "step": 2570 + }, + { + "epoch": 0.15, + "learning_rate": 4.955969343539162e-06, + "logits/chosen": -2.906745433807373, + "logits/rejected": -2.770998001098633, + "logps/chosen": -119.28253173828125, + "logps/rejected": -986.7303466796875, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4944824278354645, + "rewards/margins": 8.923219680786133, + "rewards/rejected": -9.417702674865723, + "step": 2580 + }, + { + "epoch": 0.15, + "learning_rate": 4.954991688277391e-06, + "logits/chosen": -2.867947816848755, + "logits/rejected": -2.789060115814209, + "logps/chosen": -156.2045135498047, + "logps/rejected": -965.4269409179688, + "loss": 0.0209, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8836523294448853, + "rewards/margins": 8.333108901977539, + "rewards/rejected": -9.216760635375977, + "step": 2590 + }, + { + "epoch": 0.16, + "learning_rate": 4.954003396522325e-06, + "logits/chosen": -2.94377064704895, + "logits/rejected": -2.8526902198791504, + "logps/chosen": -158.0526885986328, + "logps/rejected": -1045.19482421875, + "loss": 0.0261, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9019582867622375, + "rewards/margins": 9.109042167663574, + "rewards/rejected": -10.011000633239746, + "step": 2600 + }, + { + "epoch": 0.16, + "learning_rate": 4.953004472555838e-06, + "logits/chosen": -2.913330316543579, + "logits/rejected": -2.7569892406463623, + "logps/chosen": -148.7244415283203, + "logps/rejected": -857.0618896484375, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7785552144050598, + "rewards/margins": 7.3511457443237305, + "rewards/rejected": -8.129701614379883, + "step": 2610 + }, + { + "epoch": 0.16, + "learning_rate": 4.951994920705865e-06, + "logits/chosen": -2.9549460411071777, + "logits/rejected": -2.8786098957061768, + "logps/chosen": -102.9132308959961, + "logps/rejected": -871.0667724609375, + "loss": 0.0901, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.32913029193878174, + "rewards/margins": 7.944624423980713, + "rewards/rejected": -8.273754119873047, + "step": 2620 + }, + { + "epoch": 0.16, + "learning_rate": 4.95097474534639e-06, + "logits/chosen": -2.906001567840576, + "logits/rejected": -2.8609766960144043, + "logps/chosen": -108.26625061035156, + "logps/rejected": -946.1947021484375, + "loss": 0.0381, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.328972190618515, + "rewards/margins": 8.691851615905762, + "rewards/rejected": -9.020824432373047, + "step": 2630 + }, + { + "epoch": 0.16, + "learning_rate": 4.949943950897422e-06, + "logits/chosen": -2.9310507774353027, + "logits/rejected": -2.8409712314605713, + "logps/chosen": -76.93956756591797, + "logps/rejected": -872.7999877929688, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0723833218216896, + "rewards/margins": 8.212732315063477, + "rewards/rejected": -8.285116195678711, + "step": 2640 + }, + { + "epoch": 0.16, + "learning_rate": 4.94890254182498e-06, + "logits/chosen": -2.9161858558654785, + "logits/rejected": -2.83249568939209, + "logps/chosen": -74.18345642089844, + "logps/rejected": -844.9074096679688, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012019271962344646, + "rewards/margins": 7.98058557510376, + "rewards/rejected": -7.992604732513428, + "step": 2650 + }, + { + "epoch": 0.16, + "learning_rate": 4.947850522641072e-06, + "logits/chosen": -2.9110209941864014, + "logits/rejected": -2.8343100547790527, + "logps/chosen": -81.40888214111328, + "logps/rejected": -758.9796142578125, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1388753354549408, + "rewards/margins": 7.00775671005249, + "rewards/rejected": -7.146633148193359, + "step": 2660 + }, + { + "epoch": 0.16, + "learning_rate": 4.946787897903674e-06, + "logits/chosen": -2.9253344535827637, + "logits/rejected": -2.802432060241699, + "logps/chosen": -81.86743927001953, + "logps/rejected": -1033.0467529296875, + "loss": 0.0869, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10440780967473984, + "rewards/margins": 9.79059886932373, + "rewards/rejected": -9.89500617980957, + "step": 2670 + }, + { + "epoch": 0.16, + "learning_rate": 4.945714672216713e-06, + "logits/chosen": -2.910594940185547, + "logits/rejected": -2.8370227813720703, + "logps/chosen": -86.73094940185547, + "logps/rejected": -1026.197021484375, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15769681334495544, + "rewards/margins": 9.66103744506836, + "rewards/rejected": -9.818734169006348, + "step": 2680 + }, + { + "epoch": 0.16, + "learning_rate": 4.944630850230045e-06, + "logits/chosen": -2.907191038131714, + "logits/rejected": -2.8154234886169434, + "logps/chosen": -81.18495178222656, + "logps/rejected": -901.1973876953125, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06847550719976425, + "rewards/margins": 8.506799697875977, + "rewards/rejected": -8.575275421142578, + "step": 2690 + }, + { + "epoch": 0.16, + "learning_rate": 4.9435364366394334e-06, + "logits/chosen": -2.9387497901916504, + "logits/rejected": -2.823193311691284, + "logps/chosen": -71.61036682128906, + "logps/rejected": -825.7556762695312, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0704922154545784, + "rewards/margins": 7.746884346008301, + "rewards/rejected": -7.817376613616943, + "step": 2700 + }, + { + "epoch": 0.16, + "learning_rate": 4.942431436186536e-06, + "logits/chosen": -2.8955795764923096, + "logits/rejected": -2.7983651161193848, + "logps/chosen": -69.2351303100586, + "logps/rejected": -830.044921875, + "loss": 0.0261, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015279693529009819, + "rewards/margins": 7.844470024108887, + "rewards/rejected": -7.859750270843506, + "step": 2710 + }, + { + "epoch": 0.16, + "learning_rate": 4.941315853658873e-06, + "logits/chosen": -2.912855863571167, + "logits/rejected": -2.789764881134033, + "logps/chosen": -95.71670532226562, + "logps/rejected": -865.7041015625, + "loss": 0.0357, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17794740200042725, + "rewards/margins": 8.018354415893555, + "rewards/rejected": -8.196301460266113, + "step": 2720 + }, + { + "epoch": 0.16, + "learning_rate": 4.940189693889819e-06, + "logits/chosen": -2.9068188667297363, + "logits/rejected": -2.7605984210968018, + "logps/chosen": -98.43501281738281, + "logps/rejected": -935.4601440429688, + "loss": 0.0266, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2977240979671478, + "rewards/margins": 8.620194435119629, + "rewards/rejected": -8.917917251586914, + "step": 2730 + }, + { + "epoch": 0.16, + "learning_rate": 4.939052961758569e-06, + "logits/chosen": -2.9329752922058105, + "logits/rejected": -2.8329977989196777, + "logps/chosen": -83.51493072509766, + "logps/rejected": -970.2652587890625, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14018702507019043, + "rewards/margins": 9.116633415222168, + "rewards/rejected": -9.256821632385254, + "step": 2740 + }, + { + "epoch": 0.16, + "learning_rate": 4.937905662190129e-06, + "logits/chosen": -2.926013231277466, + "logits/rejected": -2.8018393516540527, + "logps/chosen": -83.0693359375, + "logps/rejected": -947.9988403320312, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1644803136587143, + "rewards/margins": 8.875802040100098, + "rewards/rejected": -9.040281295776367, + "step": 2750 + }, + { + "epoch": 0.16, + "learning_rate": 4.936747800155285e-06, + "logits/chosen": -2.8951776027679443, + "logits/rejected": -2.825871467590332, + "logps/chosen": -69.45985412597656, + "logps/rejected": -886.7191162109375, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02263670228421688, + "rewards/margins": 8.406549453735352, + "rewards/rejected": -8.42918586730957, + "step": 2760 + }, + { + "epoch": 0.17, + "learning_rate": 4.935579380670592e-06, + "logits/chosen": -2.9374606609344482, + "logits/rejected": -2.8428306579589844, + "logps/chosen": -102.8312759399414, + "logps/rejected": -990.3069458007812, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28860825300216675, + "rewards/margins": 9.165548324584961, + "rewards/rejected": -9.454156875610352, + "step": 2770 + }, + { + "epoch": 0.17, + "learning_rate": 4.934400408798339e-06, + "logits/chosen": -2.8919525146484375, + "logits/rejected": -2.782038927078247, + "logps/chosen": -70.26911926269531, + "logps/rejected": -905.73681640625, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0039605796337127686, + "rewards/margins": 8.629324913024902, + "rewards/rejected": -8.625364303588867, + "step": 2780 + }, + { + "epoch": 0.17, + "learning_rate": 4.93321088964654e-06, + "logits/chosen": -2.9169554710388184, + "logits/rejected": -2.800493001937866, + "logps/chosen": -84.66515350341797, + "logps/rejected": -965.3494262695312, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13011623919010162, + "rewards/margins": 9.08696174621582, + "rewards/rejected": -9.217077255249023, + "step": 2790 + }, + { + "epoch": 0.17, + "learning_rate": 4.932010828368903e-06, + "logits/chosen": -2.9324235916137695, + "logits/rejected": -2.8542137145996094, + "logps/chosen": -124.497314453125, + "logps/rejected": -964.9666748046875, + "loss": 0.0302, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5493017435073853, + "rewards/margins": 8.65716552734375, + "rewards/rejected": -9.206467628479004, + "step": 2800 + }, + { + "epoch": 0.17, + "learning_rate": 4.930800230164812e-06, + "logits/chosen": -2.9189116954803467, + "logits/rejected": -2.7923741340637207, + "logps/chosen": -115.26444244384766, + "logps/rejected": -1061.429443359375, + "loss": 0.0209, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45345592498779297, + "rewards/margins": 9.718252182006836, + "rewards/rejected": -10.171709060668945, + "step": 2810 + }, + { + "epoch": 0.17, + "learning_rate": 4.929579100279302e-06, + "logits/chosen": -2.934934616088867, + "logits/rejected": -2.8368613719940186, + "logps/chosen": -96.55180358886719, + "logps/rejected": -878.8713989257812, + "loss": 0.0319, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2639729678630829, + "rewards/margins": 8.087080001831055, + "rewards/rejected": -8.351053237915039, + "step": 2820 + }, + { + "epoch": 0.17, + "learning_rate": 4.92834744400304e-06, + "logits/chosen": -2.9105095863342285, + "logits/rejected": -2.821148633956909, + "logps/chosen": -80.38851165771484, + "logps/rejected": -1007.0836181640625, + "loss": 0.0451, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09356964379549026, + "rewards/margins": 9.5266752243042, + "rewards/rejected": -9.620244026184082, + "step": 2830 + }, + { + "epoch": 0.17, + "learning_rate": 4.927105266672296e-06, + "logits/chosen": -2.926135540008545, + "logits/rejected": -2.81009840965271, + "logps/chosen": -103.74556732177734, + "logps/rejected": -996.8015747070312, + "loss": 0.0233, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27629369497299194, + "rewards/margins": 9.234261512756348, + "rewards/rejected": -9.510554313659668, + "step": 2840 + }, + { + "epoch": 0.17, + "learning_rate": 4.925852573668928e-06, + "logits/chosen": -2.9445695877075195, + "logits/rejected": -2.856632947921753, + "logps/chosen": -106.05985260009766, + "logps/rejected": -1072.9588623046875, + "loss": 0.0205, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33258286118507385, + "rewards/margins": 9.954703330993652, + "rewards/rejected": -10.287286758422852, + "step": 2850 + }, + { + "epoch": 0.17, + "learning_rate": 4.924589370420351e-06, + "logits/chosen": -2.953538179397583, + "logits/rejected": -2.8282103538513184, + "logps/chosen": -128.39675903320312, + "logps/rejected": -931.0433349609375, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5655667781829834, + "rewards/margins": 8.30016040802002, + "rewards/rejected": -8.865727424621582, + "step": 2860 + }, + { + "epoch": 0.17, + "learning_rate": 4.923315662399517e-06, + "logits/chosen": -2.881783962249756, + "logits/rejected": -2.797868251800537, + "logps/chosen": -95.38026428222656, + "logps/rejected": -894.2131958007812, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19933325052261353, + "rewards/margins": 8.294595718383789, + "rewards/rejected": -8.493928909301758, + "step": 2870 + }, + { + "epoch": 0.17, + "learning_rate": 4.9220314551248915e-06, + "logits/chosen": -2.89984393119812, + "logits/rejected": -2.722132921218872, + "logps/chosen": -66.76728820800781, + "logps/rejected": -843.0881958007812, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03717636317014694, + "rewards/margins": 8.017894744873047, + "rewards/rejected": -7.980717658996582, + "step": 2880 + }, + { + "epoch": 0.17, + "learning_rate": 4.920736754160429e-06, + "logits/chosen": -2.9310741424560547, + "logits/rejected": -2.846287965774536, + "logps/chosen": -93.0264663696289, + "logps/rejected": -982.4656982421875, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21156001091003418, + "rewards/margins": 9.166433334350586, + "rewards/rejected": -9.3779935836792, + "step": 2890 + }, + { + "epoch": 0.17, + "learning_rate": 4.91943156511555e-06, + "logits/chosen": -2.929076910018921, + "logits/rejected": -2.810227155685425, + "logps/chosen": -99.96405029296875, + "logps/rejected": -1053.358154296875, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.340766042470932, + "rewards/margins": 9.75223445892334, + "rewards/rejected": -10.093001365661621, + "step": 2900 + }, + { + "epoch": 0.17, + "learning_rate": 4.918115893645113e-06, + "logits/chosen": -2.9340429306030273, + "logits/rejected": -2.8081812858581543, + "logps/chosen": -75.57527160644531, + "logps/rejected": -1050.577392578125, + "loss": 0.0211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0290999673306942, + "rewards/margins": 10.024075508117676, + "rewards/rejected": -10.053176879882812, + "step": 2910 + }, + { + "epoch": 0.17, + "learning_rate": 4.916789745449396e-06, + "logits/chosen": -2.903440475463867, + "logits/rejected": -2.7989277839660645, + "logps/chosen": -66.62117767333984, + "logps/rejected": -937.4807739257812, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.022950012236833572, + "rewards/margins": 8.961909294128418, + "rewards/rejected": -8.938959121704102, + "step": 2920 + }, + { + "epoch": 0.17, + "learning_rate": 4.915453126274065e-06, + "logits/chosen": -2.9188408851623535, + "logits/rejected": -2.78098726272583, + "logps/chosen": -90.40374755859375, + "logps/rejected": -899.36083984375, + "loss": 0.0391, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.17839650809764862, + "rewards/margins": 8.379777908325195, + "rewards/rejected": -8.558175086975098, + "step": 2930 + }, + { + "epoch": 0.18, + "learning_rate": 4.914106041910155e-06, + "logits/chosen": -2.8941490650177, + "logits/rejected": -2.763282060623169, + "logps/chosen": -92.59281158447266, + "logps/rejected": -1018.7340087890625, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16338813304901123, + "rewards/margins": 9.582254409790039, + "rewards/rejected": -9.745641708374023, + "step": 2940 + }, + { + "epoch": 0.18, + "learning_rate": 4.9127484981940425e-06, + "logits/chosen": -2.9010350704193115, + "logits/rejected": -2.8127832412719727, + "logps/chosen": -91.32048034667969, + "logps/rejected": -1050.362548828125, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20067770779132843, + "rewards/margins": 9.852140426635742, + "rewards/rejected": -10.052818298339844, + "step": 2950 + }, + { + "epoch": 0.18, + "learning_rate": 4.911380501007417e-06, + "logits/chosen": -2.925996780395508, + "logits/rejected": -2.7819266319274902, + "logps/chosen": -196.75430297851562, + "logps/rejected": -961.32177734375, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.246008038520813, + "rewards/margins": 7.936345100402832, + "rewards/rejected": -9.182353019714355, + "step": 2960 + }, + { + "epoch": 0.18, + "learning_rate": 4.910002056277263e-06, + "logits/chosen": -2.8940131664276123, + "logits/rejected": -2.776315927505493, + "logps/chosen": -91.83622741699219, + "logps/rejected": -1009.2171630859375, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16851451992988586, + "rewards/margins": 9.462518692016602, + "rewards/rejected": -9.631032943725586, + "step": 2970 + }, + { + "epoch": 0.18, + "learning_rate": 4.908613169975828e-06, + "logits/chosen": -2.887906551361084, + "logits/rejected": -2.793120861053467, + "logps/chosen": -76.0972900390625, + "logps/rejected": -970.8505859375, + "loss": 0.0203, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05901686102151871, + "rewards/margins": 9.19333267211914, + "rewards/rejected": -9.252348899841309, + "step": 2980 + }, + { + "epoch": 0.18, + "learning_rate": 4.9072138481205985e-06, + "logits/chosen": -2.8974432945251465, + "logits/rejected": -2.806164264678955, + "logps/chosen": -100.28203582763672, + "logps/rejected": -1064.16748046875, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3262271285057068, + "rewards/margins": 9.866116523742676, + "rewards/rejected": -10.192342758178711, + "step": 2990 + }, + { + "epoch": 0.18, + "learning_rate": 4.905804096774274e-06, + "logits/chosen": -2.895698070526123, + "logits/rejected": -2.798494815826416, + "logps/chosen": -88.13387298583984, + "logps/rejected": -995.4547119140625, + "loss": 0.0268, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1971902847290039, + "rewards/margins": 9.32446575164795, + "rewards/rejected": -9.521655082702637, + "step": 3000 + }, + { + "epoch": 0.18, + "learning_rate": 4.90438392204474e-06, + "logits/chosen": -2.9238946437835693, + "logits/rejected": -2.8205199241638184, + "logps/chosen": -73.76292419433594, + "logps/rejected": -858.8571166992188, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.014222566969692707, + "rewards/margins": 8.153276443481445, + "rewards/rejected": -8.139055252075195, + "step": 3010 + }, + { + "epoch": 0.18, + "learning_rate": 4.902953330085045e-06, + "logits/chosen": -2.9124464988708496, + "logits/rejected": -2.8200974464416504, + "logps/chosen": -57.89391326904297, + "logps/rejected": -861.4302978515625, + "loss": 0.0206, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08842764049768448, + "rewards/margins": 8.274726867675781, + "rewards/rejected": -8.186299324035645, + "step": 3020 + }, + { + "epoch": 0.18, + "learning_rate": 4.901512327093369e-06, + "logits/chosen": -2.930318593978882, + "logits/rejected": -2.812051296234131, + "logps/chosen": -69.50514221191406, + "logps/rejected": -974.1229248046875, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03143477067351341, + "rewards/margins": 9.326597213745117, + "rewards/rejected": -9.295161247253418, + "step": 3030 + }, + { + "epoch": 0.18, + "learning_rate": 4.900060919313001e-06, + "logits/chosen": -2.9161267280578613, + "logits/rejected": -2.815610885620117, + "logps/chosen": -82.34928894042969, + "logps/rejected": -998.6640625, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1185515746474266, + "rewards/margins": 9.436986923217773, + "rewards/rejected": -9.555538177490234, + "step": 3040 + }, + { + "epoch": 0.18, + "learning_rate": 4.8985991130323055e-06, + "logits/chosen": -2.878901958465576, + "logits/rejected": -2.8079466819763184, + "logps/chosen": -78.50292205810547, + "logps/rejected": -855.5111083984375, + "loss": 0.0346, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1105574369430542, + "rewards/margins": 7.991842746734619, + "rewards/rejected": -8.102399826049805, + "step": 3050 + }, + { + "epoch": 0.18, + "learning_rate": 4.8971269145847036e-06, + "logits/chosen": -2.8982720375061035, + "logits/rejected": -2.787421226501465, + "logps/chosen": -82.7979507446289, + "logps/rejected": -998.7786865234375, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16418203711509705, + "rewards/margins": 9.379179954528809, + "rewards/rejected": -9.54336166381836, + "step": 3060 + }, + { + "epoch": 0.18, + "learning_rate": 4.895644330348639e-06, + "logits/chosen": -2.9356489181518555, + "logits/rejected": -2.790642261505127, + "logps/chosen": -107.53193664550781, + "logps/rejected": -1074.806884765625, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43008285760879517, + "rewards/margins": 9.864014625549316, + "rewards/rejected": -10.294095993041992, + "step": 3070 + }, + { + "epoch": 0.18, + "learning_rate": 4.8941513667475545e-06, + "logits/chosen": -2.9424381256103516, + "logits/rejected": -2.8311257362365723, + "logps/chosen": -68.97103881835938, + "logps/rejected": -1078.7869873046875, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0492127500474453, + "rewards/margins": 10.294766426086426, + "rewards/rejected": -10.34398078918457, + "step": 3080 + }, + { + "epoch": 0.18, + "learning_rate": 4.892648030249863e-06, + "logits/chosen": -2.918834686279297, + "logits/rejected": -2.8089890480041504, + "logps/chosen": -92.81131744384766, + "logps/rejected": -974.1917724609375, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16505205631256104, + "rewards/margins": 9.129356384277344, + "rewards/rejected": -9.29440689086914, + "step": 3090 + }, + { + "epoch": 0.18, + "learning_rate": 4.891134327368919e-06, + "logits/chosen": -2.926624298095703, + "logits/rejected": -2.830854892730713, + "logps/chosen": -90.18801879882812, + "logps/rejected": -985.90283203125, + "loss": 0.0261, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16908162832260132, + "rewards/margins": 9.234790802001953, + "rewards/rejected": -9.4038724899292, + "step": 3100 + }, + { + "epoch": 0.19, + "learning_rate": 4.889610264662984e-06, + "logits/chosen": -2.9370040893554688, + "logits/rejected": -2.7925350666046143, + "logps/chosen": -71.12479400634766, + "logps/rejected": -1097.640869140625, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02991882897913456, + "rewards/margins": 10.50147533416748, + "rewards/rejected": -10.531394004821777, + "step": 3110 + }, + { + "epoch": 0.19, + "learning_rate": 4.888075848735216e-06, + "logits/chosen": -2.9268550872802734, + "logits/rejected": -2.834078311920166, + "logps/chosen": -125.74955749511719, + "logps/rejected": -987.5769653320312, + "loss": 0.0419, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5263983607292175, + "rewards/margins": 8.890470504760742, + "rewards/rejected": -9.416869163513184, + "step": 3120 + }, + { + "epoch": 0.19, + "learning_rate": 4.8865310862336185e-06, + "logits/chosen": -2.9167327880859375, + "logits/rejected": -2.827451229095459, + "logps/chosen": -97.91780090332031, + "logps/rejected": -1008.7288208007812, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23241129517555237, + "rewards/margins": 9.407768249511719, + "rewards/rejected": -9.640179634094238, + "step": 3130 + }, + { + "epoch": 0.19, + "learning_rate": 4.88497598385103e-06, + "logits/chosen": -2.861631393432617, + "logits/rejected": -2.7636709213256836, + "logps/chosen": -90.74644470214844, + "logps/rejected": -1004.3260498046875, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24270197749137878, + "rewards/margins": 9.349315643310547, + "rewards/rejected": -9.592016220092773, + "step": 3140 + }, + { + "epoch": 0.19, + "learning_rate": 4.883410548325083e-06, + "logits/chosen": -2.9055678844451904, + "logits/rejected": -2.804964303970337, + "logps/chosen": -89.72052764892578, + "logps/rejected": -1050.116455078125, + "loss": 0.0183, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1484868973493576, + "rewards/margins": 9.905959129333496, + "rewards/rejected": -10.054445266723633, + "step": 3150 + }, + { + "epoch": 0.19, + "learning_rate": 4.881834786438183e-06, + "logits/chosen": -2.8878417015075684, + "logits/rejected": -2.828029155731201, + "logps/chosen": -64.44745635986328, + "logps/rejected": -916.2906494140625, + "loss": 0.0271, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08098606765270233, + "rewards/margins": 8.80667781829834, + "rewards/rejected": -8.725691795349121, + "step": 3160 + }, + { + "epoch": 0.19, + "learning_rate": 4.880248705017472e-06, + "logits/chosen": -2.929187774658203, + "logits/rejected": -2.8213164806365967, + "logps/chosen": -75.32865905761719, + "logps/rejected": -940.2420043945312, + "loss": 0.0218, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10205451399087906, + "rewards/margins": 8.863107681274414, + "rewards/rejected": -8.96516227722168, + "step": 3170 + }, + { + "epoch": 0.19, + "learning_rate": 4.878652310934804e-06, + "logits/chosen": -2.9131598472595215, + "logits/rejected": -2.7929470539093018, + "logps/chosen": -99.04710388183594, + "logps/rejected": -1051.5531005859375, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.302415132522583, + "rewards/margins": 9.769061088562012, + "rewards/rejected": -10.071475982666016, + "step": 3180 + }, + { + "epoch": 0.19, + "learning_rate": 4.877045611106715e-06, + "logits/chosen": -2.9377360343933105, + "logits/rejected": -2.8415417671203613, + "logps/chosen": -113.6156005859375, + "logps/rejected": -1036.766845703125, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3979889452457428, + "rewards/margins": 9.514283180236816, + "rewards/rejected": -9.912271499633789, + "step": 3190 + }, + { + "epoch": 0.19, + "learning_rate": 4.8754286124943885e-06, + "logits/chosen": -2.9039368629455566, + "logits/rejected": -2.813891887664795, + "logps/chosen": -100.67926788330078, + "logps/rejected": -997.3171997070312, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2190081626176834, + "rewards/margins": 9.310359954833984, + "rewards/rejected": -9.52936840057373, + "step": 3200 + }, + { + "epoch": 0.19, + "learning_rate": 4.873801322103632e-06, + "logits/chosen": -2.900237798690796, + "logits/rejected": -2.7764649391174316, + "logps/chosen": -73.33778381347656, + "logps/rejected": -1124.5582275390625, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0386403426527977, + "rewards/margins": 10.754034996032715, + "rewards/rejected": -10.792677879333496, + "step": 3210 + }, + { + "epoch": 0.19, + "learning_rate": 4.872163746984839e-06, + "logits/chosen": -2.866896152496338, + "logits/rejected": -2.789792537689209, + "logps/chosen": -77.7817611694336, + "logps/rejected": -1108.5367431640625, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11586730182170868, + "rewards/margins": 10.520917892456055, + "rewards/rejected": -10.636785507202148, + "step": 3220 + }, + { + "epoch": 0.19, + "learning_rate": 4.8705158942329676e-06, + "logits/chosen": -2.9006447792053223, + "logits/rejected": -2.8281712532043457, + "logps/chosen": -97.91754150390625, + "logps/rejected": -934.1824340820312, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2949380874633789, + "rewards/margins": 8.605655670166016, + "rewards/rejected": -8.900593757629395, + "step": 3230 + }, + { + "epoch": 0.19, + "learning_rate": 4.8688577709875015e-06, + "logits/chosen": -2.896557569503784, + "logits/rejected": -2.8031506538391113, + "logps/chosen": -85.1817398071289, + "logps/rejected": -871.4775390625, + "loss": 0.0312, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1452803909778595, + "rewards/margins": 8.143285751342773, + "rewards/rejected": -8.288567543029785, + "step": 3240 + }, + { + "epoch": 0.19, + "learning_rate": 4.8671893844324215e-06, + "logits/chosen": -2.8887970447540283, + "logits/rejected": -2.7693824768066406, + "logps/chosen": -91.21710968017578, + "logps/rejected": -1113.0513916015625, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19352278113365173, + "rewards/margins": 10.491316795349121, + "rewards/rejected": -10.684839248657227, + "step": 3250 + }, + { + "epoch": 0.19, + "learning_rate": 4.865510741796178e-06, + "logits/chosen": -2.9023499488830566, + "logits/rejected": -2.8091979026794434, + "logps/chosen": -126.04353332519531, + "logps/rejected": -1067.5782470703125, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5405257940292358, + "rewards/margins": 9.677751541137695, + "rewards/rejected": -10.218276023864746, + "step": 3260 + }, + { + "epoch": 0.19, + "learning_rate": 4.863821850351655e-06, + "logits/chosen": -2.8556227684020996, + "logits/rejected": -2.7804763317108154, + "logps/chosen": -105.49711608886719, + "logps/rejected": -1011.6388549804688, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37187138199806213, + "rewards/margins": 9.298276901245117, + "rewards/rejected": -9.670148849487305, + "step": 3270 + }, + { + "epoch": 0.2, + "learning_rate": 4.862122717416142e-06, + "logits/chosen": -2.8979685306549072, + "logits/rejected": -2.756927013397217, + "logps/chosen": -97.33106994628906, + "logps/rejected": -1057.734375, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22641482949256897, + "rewards/margins": 9.916643142700195, + "rewards/rejected": -10.143056869506836, + "step": 3280 + }, + { + "epoch": 0.2, + "learning_rate": 4.860413350351299e-06, + "logits/chosen": -2.9265663623809814, + "logits/rejected": -2.8173117637634277, + "logps/chosen": -117.81965637207031, + "logps/rejected": -1057.960205078125, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5091457962989807, + "rewards/margins": 9.628435134887695, + "rewards/rejected": -10.137581825256348, + "step": 3290 + }, + { + "epoch": 0.2, + "learning_rate": 4.8586937565631265e-06, + "logits/chosen": -2.8908934593200684, + "logits/rejected": -2.7778306007385254, + "logps/chosen": -102.55086517333984, + "logps/rejected": -1013.2619018554688, + "loss": 0.0218, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29052549600601196, + "rewards/margins": 9.392009735107422, + "rewards/rejected": -9.682535171508789, + "step": 3300 + }, + { + "epoch": 0.2, + "learning_rate": 4.856963943501935e-06, + "logits/chosen": -2.896718740463257, + "logits/rejected": -2.7803893089294434, + "logps/chosen": -121.4454116821289, + "logps/rejected": -1098.3824462890625, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5048161745071411, + "rewards/margins": 10.032444953918457, + "rewards/rejected": -10.537260055541992, + "step": 3310 + }, + { + "epoch": 0.2, + "learning_rate": 4.85522391866231e-06, + "logits/chosen": -2.920180559158325, + "logits/rejected": -2.8442211151123047, + "logps/chosen": -118.40858459472656, + "logps/rejected": -1077.216552734375, + "loss": 0.0283, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.511841893196106, + "rewards/margins": 9.819600105285645, + "rewards/rejected": -10.331441879272461, + "step": 3320 + }, + { + "epoch": 0.2, + "learning_rate": 4.85347368958308e-06, + "logits/chosen": -2.9260549545288086, + "logits/rejected": -2.8104348182678223, + "logps/chosen": -81.95069885253906, + "logps/rejected": -992.25732421875, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09835229814052582, + "rewards/margins": 9.383533477783203, + "rewards/rejected": -9.48188591003418, + "step": 3330 + }, + { + "epoch": 0.2, + "learning_rate": 4.8517132638472845e-06, + "logits/chosen": -2.900827407836914, + "logits/rejected": -2.8186659812927246, + "logps/chosen": -82.92295837402344, + "logps/rejected": -930.4632568359375, + "loss": 0.0366, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.1473492532968521, + "rewards/margins": 8.729012489318848, + "rewards/rejected": -8.876360893249512, + "step": 3340 + }, + { + "epoch": 0.2, + "learning_rate": 4.849942649082143e-06, + "logits/chosen": -2.878818988800049, + "logits/rejected": -2.775176525115967, + "logps/chosen": -75.95471954345703, + "logps/rejected": -954.2730712890625, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07086614519357681, + "rewards/margins": 9.017777442932129, + "rewards/rejected": -9.088644027709961, + "step": 3350 + }, + { + "epoch": 0.2, + "learning_rate": 4.848161852959016e-06, + "logits/chosen": -2.920950412750244, + "logits/rejected": -2.8400120735168457, + "logps/chosen": -66.54155731201172, + "logps/rejected": -1020.5328979492188, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.023877177387475967, + "rewards/margins": 9.787919998168945, + "rewards/rejected": -9.764042854309082, + "step": 3360 + }, + { + "epoch": 0.2, + "learning_rate": 4.84637088319338e-06, + "logits/chosen": -2.933952808380127, + "logits/rejected": -2.819856882095337, + "logps/chosen": -65.50806427001953, + "logps/rejected": -953.2479248046875, + "loss": 0.0183, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0579400435090065, + "rewards/margins": 9.150853157043457, + "rewards/rejected": -9.092912673950195, + "step": 3370 + }, + { + "epoch": 0.2, + "learning_rate": 4.844569747544788e-06, + "logits/chosen": -2.9174880981445312, + "logits/rejected": -2.8065009117126465, + "logps/chosen": -70.78478240966797, + "logps/rejected": -1098.2421875, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018338222056627274, + "rewards/margins": 10.509620666503906, + "rewards/rejected": -10.527958869934082, + "step": 3380 + }, + { + "epoch": 0.2, + "learning_rate": 4.842758453816836e-06, + "logits/chosen": -2.9255166053771973, + "logits/rejected": -2.8217504024505615, + "logps/chosen": -79.82255554199219, + "logps/rejected": -989.0732421875, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11151568591594696, + "rewards/margins": 9.333974838256836, + "rewards/rejected": -9.445490837097168, + "step": 3390 + }, + { + "epoch": 0.2, + "learning_rate": 4.840937009857134e-06, + "logits/chosen": -2.9049763679504395, + "logits/rejected": -2.763141393661499, + "logps/chosen": -92.43160247802734, + "logps/rejected": -1064.4652099609375, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23361711204051971, + "rewards/margins": 9.972365379333496, + "rewards/rejected": -10.20598316192627, + "step": 3400 + }, + { + "epoch": 0.2, + "learning_rate": 4.839105423557266e-06, + "logits/chosen": -2.8952553272247314, + "logits/rejected": -2.7982900142669678, + "logps/chosen": -104.31312561035156, + "logps/rejected": -1009.9908447265625, + "loss": 0.0234, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3694721460342407, + "rewards/margins": 9.28325080871582, + "rewards/rejected": -9.65272331237793, + "step": 3410 + }, + { + "epoch": 0.2, + "learning_rate": 4.8372637028527615e-06, + "logits/chosen": -2.9060323238372803, + "logits/rejected": -2.815075635910034, + "logps/chosen": -87.04627990722656, + "logps/rejected": -1043.029052734375, + "loss": 0.0224, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11950896680355072, + "rewards/margins": 9.856620788574219, + "rewards/rejected": -9.976129531860352, + "step": 3420 + }, + { + "epoch": 0.2, + "learning_rate": 4.835411855723056e-06, + "logits/chosen": -2.9083447456359863, + "logits/rejected": -2.8016884326934814, + "logps/chosen": -99.11207580566406, + "logps/rejected": -882.466796875, + "loss": 0.0532, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.32335492968559265, + "rewards/margins": 8.073678970336914, + "rewards/rejected": -8.39703369140625, + "step": 3430 + }, + { + "epoch": 0.21, + "learning_rate": 4.83354989019146e-06, + "logits/chosen": -2.8986876010894775, + "logits/rejected": -2.777966260910034, + "logps/chosen": -98.28465270996094, + "logps/rejected": -982.7640380859375, + "loss": 0.0527, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2675360441207886, + "rewards/margins": 9.127363204956055, + "rewards/rejected": -9.394899368286133, + "step": 3440 + }, + { + "epoch": 0.21, + "learning_rate": 4.831677814325122e-06, + "logits/chosen": -2.9343008995056152, + "logits/rejected": -2.796949863433838, + "logps/chosen": -135.0299530029297, + "logps/rejected": -1021.5451049804688, + "loss": 0.1024, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6532796621322632, + "rewards/margins": 9.130407333374023, + "rewards/rejected": -9.783686637878418, + "step": 3450 + }, + { + "epoch": 0.21, + "learning_rate": 4.8297956362349955e-06, + "logits/chosen": -2.911616802215576, + "logits/rejected": -2.7802021503448486, + "logps/chosen": -190.67584228515625, + "logps/rejected": -1014.8893432617188, + "loss": 0.0299, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2101691961288452, + "rewards/margins": 8.506454467773438, + "rewards/rejected": -9.716622352600098, + "step": 3460 + }, + { + "epoch": 0.21, + "learning_rate": 4.8279033640758026e-06, + "logits/chosen": -2.909989833831787, + "logits/rejected": -2.8176238536834717, + "logps/chosen": -105.54266357421875, + "logps/rejected": -941.88037109375, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35250407457351685, + "rewards/margins": 8.61872386932373, + "rewards/rejected": -8.971227645874023, + "step": 3470 + }, + { + "epoch": 0.21, + "learning_rate": 4.826001006045997e-06, + "logits/chosen": -2.9218358993530273, + "logits/rejected": -2.858625888824463, + "logps/chosen": -77.50524139404297, + "logps/rejected": -921.4065551757812, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07038109004497528, + "rewards/margins": 8.711301803588867, + "rewards/rejected": -8.781682968139648, + "step": 3480 + }, + { + "epoch": 0.21, + "learning_rate": 4.824088570387735e-06, + "logits/chosen": -2.9038586616516113, + "logits/rejected": -2.821878433227539, + "logps/chosen": -126.85333251953125, + "logps/rejected": -932.4591674804688, + "loss": 0.0329, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5707005858421326, + "rewards/margins": 8.309720993041992, + "rewards/rejected": -8.880422592163086, + "step": 3490 + }, + { + "epoch": 0.21, + "learning_rate": 4.822166065386832e-06, + "logits/chosen": -2.919581174850464, + "logits/rejected": -2.833155632019043, + "logps/chosen": -100.0995864868164, + "logps/rejected": -1006.1409912109375, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31064051389694214, + "rewards/margins": 9.319631576538086, + "rewards/rejected": -9.63027286529541, + "step": 3500 + }, + { + "epoch": 0.21, + "learning_rate": 4.820233499372728e-06, + "logits/chosen": -2.8938403129577637, + "logits/rejected": -2.7597873210906982, + "logps/chosen": -150.28883361816406, + "logps/rejected": -1096.629638671875, + "loss": 0.0373, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7687881588935852, + "rewards/margins": 9.742793083190918, + "rewards/rejected": -10.511579513549805, + "step": 3510 + }, + { + "epoch": 0.21, + "learning_rate": 4.8182908807184585e-06, + "logits/chosen": -2.9124953746795654, + "logits/rejected": -2.7926323413848877, + "logps/chosen": -143.35337829589844, + "logps/rejected": -1126.3023681640625, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6865848302841187, + "rewards/margins": 10.126398086547852, + "rewards/rejected": -10.812982559204102, + "step": 3520 + }, + { + "epoch": 0.21, + "learning_rate": 4.816338217840607e-06, + "logits/chosen": -2.9210238456726074, + "logits/rejected": -2.838343620300293, + "logps/chosen": -76.27590942382812, + "logps/rejected": -886.6156005859375, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10457025468349457, + "rewards/margins": 8.311971664428711, + "rewards/rejected": -8.416543006896973, + "step": 3530 + }, + { + "epoch": 0.21, + "learning_rate": 4.814375519199281e-06, + "logits/chosen": -2.91465163230896, + "logits/rejected": -2.770555257797241, + "logps/chosen": -84.92073822021484, + "logps/rejected": -1105.976318359375, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18526974320411682, + "rewards/margins": 10.429521560668945, + "rewards/rejected": -10.614790916442871, + "step": 3540 + }, + { + "epoch": 0.21, + "learning_rate": 4.812402793298063e-06, + "logits/chosen": -2.902050733566284, + "logits/rejected": -2.800664186477661, + "logps/chosen": -97.14655303955078, + "logps/rejected": -958.6209716796875, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2618294358253479, + "rewards/margins": 8.886330604553223, + "rewards/rejected": -9.148159980773926, + "step": 3550 + }, + { + "epoch": 0.21, + "learning_rate": 4.810420048683985e-06, + "logits/chosen": -2.9267497062683105, + "logits/rejected": -2.8296542167663574, + "logps/chosen": -98.052001953125, + "logps/rejected": -1111.1044921875, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30213773250579834, + "rewards/margins": 10.367947578430176, + "rewards/rejected": -10.670086860656738, + "step": 3560 + }, + { + "epoch": 0.21, + "learning_rate": 4.808427293947481e-06, + "logits/chosen": -2.9082841873168945, + "logits/rejected": -2.8234333992004395, + "logps/chosen": -93.5290756225586, + "logps/rejected": -1032.8382568359375, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24832916259765625, + "rewards/margins": 9.64242172241211, + "rewards/rejected": -9.89074993133545, + "step": 3570 + }, + { + "epoch": 0.21, + "learning_rate": 4.806424537722359e-06, + "logits/chosen": -2.9553589820861816, + "logits/rejected": -2.793161630630493, + "logps/chosen": -65.16587829589844, + "logps/rejected": -999.2928466796875, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013816917315125465, + "rewards/margins": 9.536550521850586, + "rewards/rejected": -9.55036735534668, + "step": 3580 + }, + { + "epoch": 0.21, + "learning_rate": 4.804411788685755e-06, + "logits/chosen": -2.9300644397735596, + "logits/rejected": -2.81620192527771, + "logps/chosen": -67.64566040039062, + "logps/rejected": -1021.4599609375, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005144655704498291, + "rewards/margins": 9.7776460647583, + "rewards/rejected": -9.772500991821289, + "step": 3590 + }, + { + "epoch": 0.21, + "learning_rate": 4.802389055558105e-06, + "logits/chosen": -2.901089906692505, + "logits/rejected": -2.7751426696777344, + "logps/chosen": -81.9259033203125, + "logps/rejected": -1056.336181640625, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1204097643494606, + "rewards/margins": 9.990921974182129, + "rewards/rejected": -10.11133098602295, + "step": 3600 + }, + { + "epoch": 0.22, + "learning_rate": 4.8003563471030974e-06, + "logits/chosen": -2.882105588912964, + "logits/rejected": -2.8083081245422363, + "logps/chosen": -99.86497497558594, + "logps/rejected": -962.1526489257812, + "loss": 0.0408, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2984747290611267, + "rewards/margins": 8.889835357666016, + "rewards/rejected": -9.188310623168945, + "step": 3610 + }, + { + "epoch": 0.22, + "learning_rate": 4.7983136721276435e-06, + "logits/chosen": -2.927910566329956, + "logits/rejected": -2.8260843753814697, + "logps/chosen": -81.40794372558594, + "logps/rejected": -1044.443603515625, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0779222697019577, + "rewards/margins": 9.905133247375488, + "rewards/rejected": -9.983054161071777, + "step": 3620 + }, + { + "epoch": 0.22, + "learning_rate": 4.796261039481833e-06, + "logits/chosen": -2.9126973152160645, + "logits/rejected": -2.7857346534729004, + "logps/chosen": -100.78450012207031, + "logps/rejected": -987.5748291015625, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2860509753227234, + "rewards/margins": 9.147804260253906, + "rewards/rejected": -9.433855056762695, + "step": 3630 + }, + { + "epoch": 0.22, + "learning_rate": 4.7941984580589e-06, + "logits/chosen": -2.916168689727783, + "logits/rejected": -2.810474157333374, + "logps/chosen": -153.5299835205078, + "logps/rejected": -962.2683715820312, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.86139315366745, + "rewards/margins": 8.309381484985352, + "rewards/rejected": -9.170774459838867, + "step": 3640 + }, + { + "epoch": 0.22, + "learning_rate": 4.7921259367951804e-06, + "logits/chosen": -2.8897111415863037, + "logits/rejected": -2.8101799488067627, + "logps/chosen": -132.01263427734375, + "logps/rejected": -1016.0870971679688, + "loss": 0.0205, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.602690577507019, + "rewards/margins": 9.103035926818848, + "rewards/rejected": -9.70572566986084, + "step": 3650 + }, + { + "epoch": 0.22, + "learning_rate": 4.790043484670077e-06, + "logits/chosen": -2.914196014404297, + "logits/rejected": -2.8064537048339844, + "logps/chosen": -145.84518432617188, + "logps/rejected": -1150.814208984375, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8015564680099487, + "rewards/margins": 10.246492385864258, + "rewards/rejected": -11.048049926757812, + "step": 3660 + }, + { + "epoch": 0.22, + "learning_rate": 4.787951110706019e-06, + "logits/chosen": -2.907914161682129, + "logits/rejected": -2.8049521446228027, + "logps/chosen": -99.94340515136719, + "logps/rejected": -1061.523681640625, + "loss": 0.024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2665363848209381, + "rewards/margins": 9.910881042480469, + "rewards/rejected": -10.177417755126953, + "step": 3670 + }, + { + "epoch": 0.22, + "learning_rate": 4.785848823968424e-06, + "logits/chosen": -2.941972017288208, + "logits/rejected": -2.819143533706665, + "logps/chosen": -92.7989501953125, + "logps/rejected": -1125.849365234375, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18092337250709534, + "rewards/margins": 10.625335693359375, + "rewards/rejected": -10.806259155273438, + "step": 3680 + }, + { + "epoch": 0.22, + "learning_rate": 4.783736633565654e-06, + "logits/chosen": -2.919088840484619, + "logits/rejected": -2.8095946311950684, + "logps/chosen": -89.71275329589844, + "logps/rejected": -1106.349853515625, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18761476874351501, + "rewards/margins": 10.43260669708252, + "rewards/rejected": -10.620222091674805, + "step": 3690 + }, + { + "epoch": 0.22, + "learning_rate": 4.781614548648983e-06, + "logits/chosen": -2.8606014251708984, + "logits/rejected": -2.735283613204956, + "logps/chosen": -87.48583984375, + "logps/rejected": -973.7822265625, + "loss": 0.0802, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.17278487980365753, + "rewards/margins": 9.119610786437988, + "rewards/rejected": -9.292396545410156, + "step": 3700 + }, + { + "epoch": 0.22, + "learning_rate": 4.779482578412553e-06, + "logits/chosen": -2.9025392532348633, + "logits/rejected": -2.803652286529541, + "logps/chosen": -83.37914276123047, + "logps/rejected": -1047.5526123046875, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14480479061603546, + "rewards/margins": 9.883936882019043, + "rewards/rejected": -10.028741836547852, + "step": 3710 + }, + { + "epoch": 0.22, + "learning_rate": 4.7773407320933345e-06, + "logits/chosen": -2.918300151824951, + "logits/rejected": -2.7781999111175537, + "logps/chosen": -108.9967041015625, + "logps/rejected": -1034.0489501953125, + "loss": 0.0522, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4233720898628235, + "rewards/margins": 9.476350784301758, + "rewards/rejected": -9.8997220993042, + "step": 3720 + }, + { + "epoch": 0.22, + "learning_rate": 4.775189018971088e-06, + "logits/chosen": -2.9265215396881104, + "logits/rejected": -2.813169002532959, + "logps/chosen": -95.07650756835938, + "logps/rejected": -1022.4942626953125, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2539520263671875, + "rewards/margins": 9.519453048706055, + "rewards/rejected": -9.773405075073242, + "step": 3730 + }, + { + "epoch": 0.22, + "learning_rate": 4.773027448368323e-06, + "logits/chosen": -2.8871707916259766, + "logits/rejected": -2.789888381958008, + "logps/chosen": -96.50606536865234, + "logps/rejected": -993.6654052734375, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.265226274728775, + "rewards/margins": 9.223313331604004, + "rewards/rejected": -9.488539695739746, + "step": 3740 + }, + { + "epoch": 0.22, + "learning_rate": 4.770856029650257e-06, + "logits/chosen": -2.8960018157958984, + "logits/rejected": -2.804370403289795, + "logps/chosen": -92.79429626464844, + "logps/rejected": -995.08056640625, + "loss": 0.044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.248972088098526, + "rewards/margins": 9.265229225158691, + "rewards/rejected": -9.514201164245605, + "step": 3750 + }, + { + "epoch": 0.22, + "learning_rate": 4.768674772224775e-06, + "logits/chosen": -2.9093520641326904, + "logits/rejected": -2.789581537246704, + "logps/chosen": -87.56285095214844, + "logps/rejected": -1031.72119140625, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1927683800458908, + "rewards/margins": 9.660600662231445, + "rewards/rejected": -9.85336971282959, + "step": 3760 + }, + { + "epoch": 0.22, + "learning_rate": 4.766483685542389e-06, + "logits/chosen": -2.9082770347595215, + "logits/rejected": -2.8302905559539795, + "logps/chosen": -65.1877212524414, + "logps/rejected": -918.7218017578125, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01386135071516037, + "rewards/margins": 8.758512496948242, + "rewards/rejected": -8.744649887084961, + "step": 3770 + }, + { + "epoch": 0.23, + "learning_rate": 4.764282779096199e-06, + "logits/chosen": -2.894007444381714, + "logits/rejected": -2.8101418018341064, + "logps/chosen": -72.62544250488281, + "logps/rejected": -973.9033203125, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019312819465994835, + "rewards/margins": 9.27327823638916, + "rewards/rejected": -9.292590141296387, + "step": 3780 + }, + { + "epoch": 0.23, + "learning_rate": 4.762072062421849e-06, + "logits/chosen": -2.902066230773926, + "logits/rejected": -2.796579122543335, + "logps/chosen": -68.55062103271484, + "logps/rejected": -981.8231201171875, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021280916407704353, + "rewards/margins": 9.347227096557617, + "rewards/rejected": -9.368507385253906, + "step": 3790 + }, + { + "epoch": 0.23, + "learning_rate": 4.759851545097486e-06, + "logits/chosen": -2.927185535430908, + "logits/rejected": -2.8071682453155518, + "logps/chosen": -94.28609466552734, + "logps/rejected": -1121.700439453125, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21990816295146942, + "rewards/margins": 10.542993545532227, + "rewards/rejected": -10.762903213500977, + "step": 3800 + }, + { + "epoch": 0.23, + "learning_rate": 4.75762123674372e-06, + "logits/chosen": -2.905829429626465, + "logits/rejected": -2.7959067821502686, + "logps/chosen": -113.93766784667969, + "logps/rejected": -1116.533203125, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45689186453819275, + "rewards/margins": 10.273436546325684, + "rewards/rejected": -10.730328559875488, + "step": 3810 + }, + { + "epoch": 0.23, + "learning_rate": 4.755381147023582e-06, + "logits/chosen": -2.88506817817688, + "logits/rejected": -2.785269260406494, + "logps/chosen": -77.04222106933594, + "logps/rejected": -931.8571166992188, + "loss": 0.0347, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10545346885919571, + "rewards/margins": 8.777097702026367, + "rewards/rejected": -8.882551193237305, + "step": 3820 + }, + { + "epoch": 0.23, + "learning_rate": 4.7531312856424814e-06, + "logits/chosen": -2.935608148574829, + "logits/rejected": -2.8553714752197266, + "logps/chosen": -71.35791015625, + "logps/rejected": -902.74267578125, + "loss": 0.0237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005191388539969921, + "rewards/margins": 8.57547378540039, + "rewards/rejected": -8.58066463470459, + "step": 3830 + }, + { + "epoch": 0.23, + "learning_rate": 4.750871662348164e-06, + "logits/chosen": -2.9029181003570557, + "logits/rejected": -2.8173446655273438, + "logps/chosen": -75.56688690185547, + "logps/rejected": -913.6727294921875, + "loss": 0.0209, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02877405285835266, + "rewards/margins": 8.667641639709473, + "rewards/rejected": -8.696414947509766, + "step": 3840 + }, + { + "epoch": 0.23, + "learning_rate": 4.748602286930671e-06, + "logits/chosen": -2.8777687549591064, + "logits/rejected": -2.769540309906006, + "logps/chosen": -68.40373992919922, + "logps/rejected": -1035.921875, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.001151949167251587, + "rewards/margins": 9.908658981323242, + "rewards/rejected": -9.907508850097656, + "step": 3850 + }, + { + "epoch": 0.23, + "learning_rate": 4.746323169222295e-06, + "logits/chosen": -2.884075880050659, + "logits/rejected": -2.7866289615631104, + "logps/chosen": -74.61378479003906, + "logps/rejected": -900.3932495117188, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.044218212366104126, + "rewards/margins": 8.607454299926758, + "rewards/rejected": -8.563236236572266, + "step": 3860 + }, + { + "epoch": 0.23, + "learning_rate": 4.744034319097536e-06, + "logits/chosen": -2.9046847820281982, + "logits/rejected": -2.810368776321411, + "logps/chosen": -75.18858337402344, + "logps/rejected": -1058.2366943359375, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014871361665427685, + "rewards/margins": 10.108458518981934, + "rewards/rejected": -10.123331069946289, + "step": 3870 + }, + { + "epoch": 0.23, + "learning_rate": 4.741735746473063e-06, + "logits/chosen": -2.9184045791625977, + "logits/rejected": -2.795245885848999, + "logps/chosen": -69.85333251953125, + "logps/rejected": -994.2190551757812, + "loss": 0.0254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.050799496471881866, + "rewards/margins": 9.447065353393555, + "rewards/rejected": -9.497864723205566, + "step": 3880 + }, + { + "epoch": 0.23, + "learning_rate": 4.739427461307671e-06, + "logits/chosen": -2.8918087482452393, + "logits/rejected": -2.7953124046325684, + "logps/chosen": -78.41696166992188, + "logps/rejected": -944.1104736328125, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.052492789924144745, + "rewards/margins": 8.951055526733398, + "rewards/rejected": -9.003546714782715, + "step": 3890 + }, + { + "epoch": 0.23, + "learning_rate": 4.73710947360223e-06, + "logits/chosen": -2.8873825073242188, + "logits/rejected": -2.7540669441223145, + "logps/chosen": -96.78419494628906, + "logps/rejected": -975.72412109375, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23091724514961243, + "rewards/margins": 9.077892303466797, + "rewards/rejected": -9.308808326721191, + "step": 3900 + }, + { + "epoch": 0.23, + "learning_rate": 4.734781793399651e-06, + "logits/chosen": -2.9157841205596924, + "logits/rejected": -2.7792770862579346, + "logps/chosen": -85.96147155761719, + "logps/rejected": -1089.6395263671875, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13481441140174866, + "rewards/margins": 10.323954582214355, + "rewards/rejected": -10.458769798278809, + "step": 3910 + }, + { + "epoch": 0.23, + "learning_rate": 4.732444430784838e-06, + "logits/chosen": -2.902320623397827, + "logits/rejected": -2.7530972957611084, + "logps/chosen": -84.19400787353516, + "logps/rejected": -1082.6468505859375, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07995015382766724, + "rewards/margins": 10.297528266906738, + "rewards/rejected": -10.37747859954834, + "step": 3920 + }, + { + "epoch": 0.23, + "learning_rate": 4.730097395884645e-06, + "logits/chosen": -2.935899496078491, + "logits/rejected": -2.7937304973602295, + "logps/chosen": -78.46994018554688, + "logps/rejected": -961.6571044921875, + "loss": 0.0244, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08862430602312088, + "rewards/margins": 9.097587585449219, + "rewards/rejected": -9.186212539672852, + "step": 3930 + }, + { + "epoch": 0.23, + "learning_rate": 4.727740698867831e-06, + "logits/chosen": -2.900052309036255, + "logits/rejected": -2.8231678009033203, + "logps/chosen": -87.9262924194336, + "logps/rejected": -1028.0103759765625, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1742534339427948, + "rewards/margins": 9.663190841674805, + "rewards/rejected": -9.837444305419922, + "step": 3940 + }, + { + "epoch": 0.24, + "learning_rate": 4.725374349945019e-06, + "logits/chosen": -2.9103264808654785, + "logits/rejected": -2.8139142990112305, + "logps/chosen": -107.99403381347656, + "logps/rejected": -1099.698974609375, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37500011920928955, + "rewards/margins": 10.16627311706543, + "rewards/rejected": -10.541272163391113, + "step": 3950 + }, + { + "epoch": 0.24, + "learning_rate": 4.7229983593686465e-06, + "logits/chosen": -2.899981737136841, + "logits/rejected": -2.7954936027526855, + "logps/chosen": -82.14442443847656, + "logps/rejected": -1063.176025390625, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11459366232156754, + "rewards/margins": 10.062212944030762, + "rewards/rejected": -10.176806449890137, + "step": 3960 + }, + { + "epoch": 0.24, + "learning_rate": 4.72061273743293e-06, + "logits/chosen": -2.9029393196105957, + "logits/rejected": -2.8007359504699707, + "logps/chosen": -93.69132995605469, + "logps/rejected": -1082.078125, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24225322902202606, + "rewards/margins": 10.138811111450195, + "rewards/rejected": -10.381063461303711, + "step": 3970 + }, + { + "epoch": 0.24, + "learning_rate": 4.718217494473809e-06, + "logits/chosen": -2.8980863094329834, + "logits/rejected": -2.782282590866089, + "logps/chosen": -76.29203033447266, + "logps/rejected": -1009.2958984375, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09314502775669098, + "rewards/margins": 9.557165145874023, + "rewards/rejected": -9.650311470031738, + "step": 3980 + }, + { + "epoch": 0.24, + "learning_rate": 4.715812640868911e-06, + "logits/chosen": -2.941286087036133, + "logits/rejected": -2.8018040657043457, + "logps/chosen": -83.56819915771484, + "logps/rejected": -1039.623779296875, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02263762429356575, + "rewards/margins": 9.941400527954102, + "rewards/rejected": -9.964037895202637, + "step": 3990 + }, + { + "epoch": 0.24, + "learning_rate": 4.7133981870375e-06, + "logits/chosen": -2.9263908863067627, + "logits/rejected": -2.8119800090789795, + "logps/chosen": -94.2664794921875, + "logps/rejected": -1165.2017822265625, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19283881783485413, + "rewards/margins": 11.018218040466309, + "rewards/rejected": -11.21105670928955, + "step": 4000 + }, + { + "epoch": 0.24, + "learning_rate": 4.710974143440435e-06, + "logits/chosen": -2.916626453399658, + "logits/rejected": -2.825366258621216, + "logps/chosen": -79.94502258300781, + "logps/rejected": -1028.0797119140625, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13081735372543335, + "rewards/margins": 9.698927879333496, + "rewards/rejected": -9.829744338989258, + "step": 4010 + }, + { + "epoch": 0.24, + "learning_rate": 4.708540520580125e-06, + "logits/chosen": -2.9402849674224854, + "logits/rejected": -2.8051180839538574, + "logps/chosen": -74.67461395263672, + "logps/rejected": -1090.4091796875, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009149352088570595, + "rewards/margins": 10.44430923461914, + "rewards/rejected": -10.453458786010742, + "step": 4020 + }, + { + "epoch": 0.24, + "learning_rate": 4.70609732900048e-06, + "logits/chosen": -2.905668020248413, + "logits/rejected": -2.803889751434326, + "logps/chosen": -69.18238830566406, + "logps/rejected": -1125.2408447265625, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05240710452198982, + "rewards/margins": 10.851551055908203, + "rewards/rejected": -10.799144744873047, + "step": 4030 + }, + { + "epoch": 0.24, + "learning_rate": 4.703644579286867e-06, + "logits/chosen": -2.8862783908843994, + "logits/rejected": -2.765496253967285, + "logps/chosen": -69.36138916015625, + "logps/rejected": -1107.096435546875, + "loss": 0.0877, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03343920782208443, + "rewards/margins": 10.67336654663086, + "rewards/rejected": -10.63992691040039, + "step": 4040 + }, + { + "epoch": 0.24, + "learning_rate": 4.701182282066068e-06, + "logits/chosen": -2.9192187786102295, + "logits/rejected": -2.7758922576904297, + "logps/chosen": -77.3496322631836, + "logps/rejected": -1172.750732421875, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03041331097483635, + "rewards/margins": 11.249832153320312, + "rewards/rejected": -11.28024673461914, + "step": 4050 + }, + { + "epoch": 0.24, + "learning_rate": 4.698710448006226e-06, + "logits/chosen": -2.9110758304595947, + "logits/rejected": -2.805079698562622, + "logps/chosen": -68.88075256347656, + "logps/rejected": -1095.01123046875, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07891975343227386, + "rewards/margins": 10.428773880004883, + "rewards/rejected": -10.50769329071045, + "step": 4060 + }, + { + "epoch": 0.24, + "learning_rate": 4.696229087816808e-06, + "logits/chosen": -2.905869245529175, + "logits/rejected": -2.8004088401794434, + "logps/chosen": -71.49755859375, + "logps/rejected": -1070.1978759765625, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01815054751932621, + "rewards/margins": 10.243074417114258, + "rewards/rejected": -10.261224746704102, + "step": 4070 + }, + { + "epoch": 0.24, + "learning_rate": 4.693738212248549e-06, + "logits/chosen": -2.896515369415283, + "logits/rejected": -2.7848098278045654, + "logps/chosen": -103.5752182006836, + "logps/rejected": -1010.2786865234375, + "loss": 0.0306, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33002567291259766, + "rewards/margins": 9.334737777709961, + "rewards/rejected": -9.664763450622559, + "step": 4080 + }, + { + "epoch": 0.24, + "learning_rate": 4.6912378320934134e-06, + "logits/chosen": -2.8731987476348877, + "logits/rejected": -2.7809576988220215, + "logps/chosen": -70.00006866455078, + "logps/rejected": -992.94287109375, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08050527423620224, + "rewards/margins": 9.564136505126953, + "rewards/rejected": -9.483631134033203, + "step": 4090 + }, + { + "epoch": 0.24, + "learning_rate": 4.688727958184545e-06, + "logits/chosen": -2.913094997406006, + "logits/rejected": -2.818051815032959, + "logps/chosen": -98.42679595947266, + "logps/rejected": -1067.169677734375, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2496531903743744, + "rewards/margins": 9.977968215942383, + "rewards/rejected": -10.227621078491211, + "step": 4100 + }, + { + "epoch": 0.25, + "learning_rate": 4.68620860139622e-06, + "logits/chosen": -2.8749492168426514, + "logits/rejected": -2.7749624252319336, + "logps/chosen": -110.8221206665039, + "logps/rejected": -1056.250244140625, + "loss": 0.0253, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3982095718383789, + "rewards/margins": 9.721628189086914, + "rewards/rejected": -10.119839668273926, + "step": 4110 + }, + { + "epoch": 0.25, + "learning_rate": 4.683679772643799e-06, + "logits/chosen": -2.8999485969543457, + "logits/rejected": -2.801888942718506, + "logps/chosen": -92.64431762695312, + "logps/rejected": -1134.45068359375, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26071274280548096, + "rewards/margins": 10.631994247436523, + "rewards/rejected": -10.892707824707031, + "step": 4120 + }, + { + "epoch": 0.25, + "learning_rate": 4.681141482883682e-06, + "logits/chosen": -2.877768039703369, + "logits/rejected": -2.8172197341918945, + "logps/chosen": -109.5040283203125, + "logps/rejected": -985.7999877929688, + "loss": 0.033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3617154061794281, + "rewards/margins": 9.049224853515625, + "rewards/rejected": -9.410940170288086, + "step": 4130 + }, + { + "epoch": 0.25, + "learning_rate": 4.6785937431132596e-06, + "logits/chosen": -2.884094715118408, + "logits/rejected": -2.7508468627929688, + "logps/chosen": -99.2677993774414, + "logps/rejected": -1108.4849853515625, + "loss": 0.0245, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23048047721385956, + "rewards/margins": 10.412751197814941, + "rewards/rejected": -10.643231391906738, + "step": 4140 + }, + { + "epoch": 0.25, + "learning_rate": 4.676036564370865e-06, + "logits/chosen": -2.9182515144348145, + "logits/rejected": -2.7855281829833984, + "logps/chosen": -86.19815063476562, + "logps/rejected": -973.0714111328125, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17939230799674988, + "rewards/margins": 9.110641479492188, + "rewards/rejected": -9.290034294128418, + "step": 4150 + }, + { + "epoch": 0.25, + "learning_rate": 4.6734699577357265e-06, + "logits/chosen": -2.952885627746582, + "logits/rejected": -2.8558971881866455, + "logps/chosen": -72.60917663574219, + "logps/rejected": -912.0119018554688, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023732393980026245, + "rewards/margins": 8.656572341918945, + "rewards/rejected": -8.680304527282715, + "step": 4160 + }, + { + "epoch": 0.25, + "learning_rate": 4.670893934327921e-06, + "logits/chosen": -2.945774555206299, + "logits/rejected": -2.8346006870269775, + "logps/chosen": -59.6494140625, + "logps/rejected": -1054.052734375, + "loss": 0.066, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0770690068602562, + "rewards/margins": 10.181549072265625, + "rewards/rejected": -10.104479789733887, + "step": 4170 + }, + { + "epoch": 0.25, + "learning_rate": 4.668308505308323e-06, + "logits/chosen": -2.905186176300049, + "logits/rejected": -2.8176016807556152, + "logps/chosen": -66.75899505615234, + "logps/rejected": -979.2154541015625, + "loss": 0.0281, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03944163769483566, + "rewards/margins": 9.391366958618164, + "rewards/rejected": -9.351924896240234, + "step": 4180 + }, + { + "epoch": 0.25, + "learning_rate": 4.6657136818785596e-06, + "logits/chosen": -2.9032211303710938, + "logits/rejected": -2.816678524017334, + "logps/chosen": -74.63514709472656, + "logps/rejected": -1158.8536376953125, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07231833785772324, + "rewards/margins": 11.082597732543945, + "rewards/rejected": -11.154916763305664, + "step": 4190 + }, + { + "epoch": 0.25, + "learning_rate": 4.663109475280958e-06, + "logits/chosen": -2.9040133953094482, + "logits/rejected": -2.81174373626709, + "logps/chosen": -141.2930908203125, + "logps/rejected": -1220.878662109375, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7061454653739929, + "rewards/margins": 11.046578407287598, + "rewards/rejected": -11.752723693847656, + "step": 4200 + }, + { + "epoch": 0.25, + "learning_rate": 4.660495896798499e-06, + "logits/chosen": -2.9432120323181152, + "logits/rejected": -2.774690866470337, + "logps/chosen": -109.40970611572266, + "logps/rejected": -1126.1468505859375, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3487492501735687, + "rewards/margins": 10.462849617004395, + "rewards/rejected": -10.811599731445312, + "step": 4210 + }, + { + "epoch": 0.25, + "learning_rate": 4.65787295775477e-06, + "logits/chosen": -2.9040253162384033, + "logits/rejected": -2.774690866470337, + "logps/chosen": -72.62080383300781, + "logps/rejected": -966.7214965820312, + "loss": 0.035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015141752548515797, + "rewards/margins": 9.208559036254883, + "rewards/rejected": -9.223701477050781, + "step": 4220 + }, + { + "epoch": 0.25, + "learning_rate": 4.655240669513913e-06, + "logits/chosen": -2.9094412326812744, + "logits/rejected": -2.8058242797851562, + "logps/chosen": -71.72166442871094, + "logps/rejected": -1042.1463623046875, + "loss": 0.0229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030352765694260597, + "rewards/margins": 9.953377723693848, + "rewards/rejected": -9.98373031616211, + "step": 4230 + }, + { + "epoch": 0.25, + "learning_rate": 4.652599043480574e-06, + "logits/chosen": -2.9107279777526855, + "logits/rejected": -2.8319244384765625, + "logps/chosen": -76.95477294921875, + "logps/rejected": -1065.914306640625, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08376533538103104, + "rewards/margins": 10.118742942810059, + "rewards/rejected": -10.202507972717285, + "step": 4240 + }, + { + "epoch": 0.25, + "learning_rate": 4.64994809109986e-06, + "logits/chosen": -2.9013326168060303, + "logits/rejected": -2.7833971977233887, + "logps/chosen": -106.92848205566406, + "logps/rejected": -995.4361572265625, + "loss": 0.0206, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3284170627593994, + "rewards/margins": 9.189836502075195, + "rewards/rejected": -9.5182523727417, + "step": 4250 + }, + { + "epoch": 0.25, + "learning_rate": 4.647287823857283e-06, + "logits/chosen": -2.8964715003967285, + "logits/rejected": -2.762953281402588, + "logps/chosen": -159.28265380859375, + "logps/rejected": -1047.5933837890625, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9117465019226074, + "rewards/margins": 9.121123313903809, + "rewards/rejected": -10.032869338989258, + "step": 4260 + }, + { + "epoch": 0.25, + "learning_rate": 4.644618253278712e-06, + "logits/chosen": -2.9185850620269775, + "logits/rejected": -2.808415412902832, + "logps/chosen": -161.60824584960938, + "logps/rejected": -1039.034912109375, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9266613721847534, + "rewards/margins": 9.022591590881348, + "rewards/rejected": -9.949252128601074, + "step": 4270 + }, + { + "epoch": 0.26, + "learning_rate": 4.6419393909303254e-06, + "logits/chosen": -2.904066562652588, + "logits/rejected": -2.8094186782836914, + "logps/chosen": -145.71018981933594, + "logps/rejected": -1020.9894409179688, + "loss": 0.0247, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.714751124382019, + "rewards/margins": 9.04936695098877, + "rewards/rejected": -9.764118194580078, + "step": 4280 + }, + { + "epoch": 0.26, + "learning_rate": 4.639251248418558e-06, + "logits/chosen": -2.9164223670959473, + "logits/rejected": -2.785963773727417, + "logps/chosen": -110.82879638671875, + "logps/rejected": -1146.018798828125, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40823906660079956, + "rewards/margins": 10.603636741638184, + "rewards/rejected": -11.011876106262207, + "step": 4290 + }, + { + "epoch": 0.26, + "learning_rate": 4.636553837390051e-06, + "logits/chosen": -2.9003405570983887, + "logits/rejected": -2.7966482639312744, + "logps/chosen": -87.22411346435547, + "logps/rejected": -1113.9827880859375, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16813921928405762, + "rewards/margins": 10.537897109985352, + "rewards/rejected": -10.706036567687988, + "step": 4300 + }, + { + "epoch": 0.26, + "learning_rate": 4.6338471695316046e-06, + "logits/chosen": -2.8989522457122803, + "logits/rejected": -2.8207175731658936, + "logps/chosen": -108.576416015625, + "logps/rejected": -1153.6016845703125, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39755091071128845, + "rewards/margins": 10.700644493103027, + "rewards/rejected": -11.098196029663086, + "step": 4310 + }, + { + "epoch": 0.26, + "learning_rate": 4.631131256570124e-06, + "logits/chosen": -2.9059391021728516, + "logits/rejected": -2.791686534881592, + "logps/chosen": -125.39930725097656, + "logps/rejected": -1061.9052734375, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.539800763130188, + "rewards/margins": 9.600065231323242, + "rewards/rejected": -10.139867782592773, + "step": 4320 + }, + { + "epoch": 0.26, + "learning_rate": 4.628406110272568e-06, + "logits/chosen": -2.9412436485290527, + "logits/rejected": -2.79535174369812, + "logps/chosen": -104.64253997802734, + "logps/rejected": -1198.5625, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35039615631103516, + "rewards/margins": 11.188610076904297, + "rewards/rejected": -11.539005279541016, + "step": 4330 + }, + { + "epoch": 0.26, + "learning_rate": 4.625671742445903e-06, + "logits/chosen": -2.8668785095214844, + "logits/rejected": -2.7742056846618652, + "logps/chosen": -92.81754302978516, + "logps/rejected": -1065.806884765625, + "loss": 0.0518, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.245576411485672, + "rewards/margins": 9.967387199401855, + "rewards/rejected": -10.212964057922363, + "step": 4340 + }, + { + "epoch": 0.26, + "learning_rate": 4.622928164937046e-06, + "logits/chosen": -2.91459059715271, + "logits/rejected": -2.792318820953369, + "logps/chosen": -108.16807556152344, + "logps/rejected": -1020.1964721679688, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38700878620147705, + "rewards/margins": 9.374357223510742, + "rewards/rejected": -9.76136589050293, + "step": 4350 + }, + { + "epoch": 0.26, + "learning_rate": 4.620175389632817e-06, + "logits/chosen": -2.8872015476226807, + "logits/rejected": -2.811877727508545, + "logps/chosen": -129.07662963867188, + "logps/rejected": -1071.6632080078125, + "loss": 0.0438, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6013280749320984, + "rewards/margins": 9.681346893310547, + "rewards/rejected": -10.282674789428711, + "step": 4360 + }, + { + "epoch": 0.26, + "learning_rate": 4.617413428459887e-06, + "logits/chosen": -2.8861045837402344, + "logits/rejected": -2.7798550128936768, + "logps/chosen": -140.4359893798828, + "logps/rejected": -1040.20068359375, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7209984064102173, + "rewards/margins": 9.245370864868164, + "rewards/rejected": -9.96636962890625, + "step": 4370 + }, + { + "epoch": 0.26, + "learning_rate": 4.614642293384724e-06, + "logits/chosen": -2.883229970932007, + "logits/rejected": -2.77640700340271, + "logps/chosen": -126.4269027709961, + "logps/rejected": -1000.5406494140625, + "loss": 0.0256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5862837433815002, + "rewards/margins": 8.971501350402832, + "rewards/rejected": -9.557785034179688, + "step": 4380 + }, + { + "epoch": 0.26, + "learning_rate": 4.611861996413542e-06, + "logits/chosen": -2.905961275100708, + "logits/rejected": -2.8052828311920166, + "logps/chosen": -66.661865234375, + "logps/rejected": -999.3909301757812, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.053945302963256836, + "rewards/margins": 9.615189552307129, + "rewards/rejected": -9.561243057250977, + "step": 4390 + }, + { + "epoch": 0.26, + "learning_rate": 4.609072549592255e-06, + "logits/chosen": -2.8728692531585693, + "logits/rejected": -2.7769827842712402, + "logps/chosen": -78.50016021728516, + "logps/rejected": -1102.4761962890625, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02066270262002945, + "rewards/margins": 10.549295425415039, + "rewards/rejected": -10.569957733154297, + "step": 4400 + }, + { + "epoch": 0.26, + "learning_rate": 4.6062739650064135e-06, + "logits/chosen": -2.893392562866211, + "logits/rejected": -2.8116512298583984, + "logps/chosen": -91.8978271484375, + "logps/rejected": -963.6676025390625, + "loss": 0.0226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20641469955444336, + "rewards/margins": 8.978131294250488, + "rewards/rejected": -9.18454647064209, + "step": 4410 + }, + { + "epoch": 0.26, + "learning_rate": 4.603466254781162e-06, + "logits/chosen": -2.9127235412597656, + "logits/rejected": -2.815603017807007, + "logps/chosen": -96.66557312011719, + "logps/rejected": -1068.8868408203125, + "loss": 0.0326, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30028194189071655, + "rewards/margins": 9.935462951660156, + "rewards/rejected": -10.23574447631836, + "step": 4420 + }, + { + "epoch": 0.26, + "learning_rate": 4.600649431081181e-06, + "logits/chosen": -2.911548137664795, + "logits/rejected": -2.821232318878174, + "logps/chosen": -161.19851684570312, + "logps/rejected": -1012.6959228515625, + "loss": 0.0579, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.877530574798584, + "rewards/margins": 8.805020332336426, + "rewards/rejected": -9.682550430297852, + "step": 4430 + }, + { + "epoch": 0.26, + "learning_rate": 4.597823506110637e-06, + "logits/chosen": -2.866288661956787, + "logits/rejected": -2.797546863555908, + "logps/chosen": -199.53262329101562, + "logps/rejected": -1092.073974609375, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.308280348777771, + "rewards/margins": 9.171706199645996, + "rewards/rejected": -10.479988098144531, + "step": 4440 + }, + { + "epoch": 0.27, + "learning_rate": 4.594988492113128e-06, + "logits/chosen": -2.92360782623291, + "logits/rejected": -2.8020482063293457, + "logps/chosen": -95.24528503417969, + "logps/rejected": -1000.21337890625, + "loss": 0.0218, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2674865126609802, + "rewards/margins": 9.31503677368164, + "rewards/rejected": -9.582524299621582, + "step": 4450 + }, + { + "epoch": 0.27, + "learning_rate": 4.592144401371632e-06, + "logits/chosen": -2.913630247116089, + "logits/rejected": -2.8257508277893066, + "logps/chosen": -146.58274841308594, + "logps/rejected": -1209.158447265625, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7044652104377747, + "rewards/margins": 10.943525314331055, + "rewards/rejected": -11.647990226745605, + "step": 4460 + }, + { + "epoch": 0.27, + "learning_rate": 4.5892912462084515e-06, + "logits/chosen": -2.9358959197998047, + "logits/rejected": -2.7742631435394287, + "logps/chosen": -114.9584732055664, + "logps/rejected": -1048.086669921875, + "loss": 0.0227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4246769845485687, + "rewards/margins": 9.612180709838867, + "rewards/rejected": -10.036857604980469, + "step": 4470 + }, + { + "epoch": 0.27, + "learning_rate": 4.586429038985163e-06, + "logits/chosen": -2.9011335372924805, + "logits/rejected": -2.7730495929718018, + "logps/chosen": -70.93653869628906, + "logps/rejected": -1152.823486328125, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011950431391596794, + "rewards/margins": 11.079075813293457, + "rewards/rejected": -11.091026306152344, + "step": 4480 + }, + { + "epoch": 0.27, + "learning_rate": 4.583557792102559e-06, + "logits/chosen": -2.9267096519470215, + "logits/rejected": -2.8298516273498535, + "logps/chosen": -76.23814392089844, + "logps/rejected": -1001.984375, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0638723373413086, + "rewards/margins": 9.523317337036133, + "rewards/rejected": -9.587187767028809, + "step": 4490 + }, + { + "epoch": 0.27, + "learning_rate": 4.580677518000604e-06, + "logits/chosen": -2.9086670875549316, + "logits/rejected": -2.801614761352539, + "logps/chosen": -71.93829345703125, + "logps/rejected": -1079.363525390625, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009500850923359394, + "rewards/margins": 10.335481643676758, + "rewards/rejected": -10.344982147216797, + "step": 4500 + }, + { + "epoch": 0.27, + "learning_rate": 4.577788229158364e-06, + "logits/chosen": -2.9079577922821045, + "logits/rejected": -2.7817137241363525, + "logps/chosen": -76.71377563476562, + "logps/rejected": -949.2999267578125, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06510486453771591, + "rewards/margins": 8.988740921020508, + "rewards/rejected": -9.053844451904297, + "step": 4510 + }, + { + "epoch": 0.27, + "learning_rate": 4.574889938093971e-06, + "logits/chosen": -2.893267869949341, + "logits/rejected": -2.802339553833008, + "logps/chosen": -94.14060974121094, + "logps/rejected": -1111.646484375, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20732493698596954, + "rewards/margins": 10.476593017578125, + "rewards/rejected": -10.683917999267578, + "step": 4520 + }, + { + "epoch": 0.27, + "learning_rate": 4.571982657364555e-06, + "logits/chosen": -2.8677310943603516, + "logits/rejected": -2.7809898853302, + "logps/chosen": -119.81768798828125, + "logps/rejected": -1099.111083984375, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4681376814842224, + "rewards/margins": 10.077434539794922, + "rewards/rejected": -10.545572280883789, + "step": 4530 + }, + { + "epoch": 0.27, + "learning_rate": 4.569066399566196e-06, + "logits/chosen": -2.9095661640167236, + "logits/rejected": -2.819188356399536, + "logps/chosen": -108.3870849609375, + "logps/rejected": -1139.8427734375, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41503772139549255, + "rewards/margins": 10.548416137695312, + "rewards/rejected": -10.96345329284668, + "step": 4540 + }, + { + "epoch": 0.27, + "learning_rate": 4.566141177333871e-06, + "logits/chosen": -2.9172446727752686, + "logits/rejected": -2.810838222503662, + "logps/chosen": -116.5077896118164, + "logps/rejected": -1068.036376953125, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5008957982063293, + "rewards/margins": 9.735939025878906, + "rewards/rejected": -10.236834526062012, + "step": 4550 + }, + { + "epoch": 0.27, + "learning_rate": 4.563207003341389e-06, + "logits/chosen": -2.8966095447540283, + "logits/rejected": -2.8229918479919434, + "logps/chosen": -99.80728912353516, + "logps/rejected": -1024.94921875, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22795113921165466, + "rewards/margins": 9.57709789276123, + "rewards/rejected": -9.805047988891602, + "step": 4560 + }, + { + "epoch": 0.27, + "learning_rate": 4.56026389030135e-06, + "logits/chosen": -2.876563310623169, + "logits/rejected": -2.802300214767456, + "logps/chosen": -84.9843978881836, + "logps/rejected": -940.0216064453125, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12979432940483093, + "rewards/margins": 8.832286834716797, + "rewards/rejected": -8.962080955505371, + "step": 4570 + }, + { + "epoch": 0.27, + "learning_rate": 4.557311850965081e-06, + "logits/chosen": -2.9038519859313965, + "logits/rejected": -2.808375835418701, + "logps/chosen": -66.52571105957031, + "logps/rejected": -1105.466064453125, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.018833911046385765, + "rewards/margins": 10.632471084594727, + "rewards/rejected": -10.613636016845703, + "step": 4580 + }, + { + "epoch": 0.27, + "learning_rate": 4.554350898122585e-06, + "logits/chosen": -2.9017598628997803, + "logits/rejected": -2.825206995010376, + "logps/chosen": -82.07447814941406, + "logps/rejected": -937.4646606445312, + "loss": 0.0758, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14081144332885742, + "rewards/margins": 8.786211013793945, + "rewards/rejected": -8.927021980285645, + "step": 4590 + }, + { + "epoch": 0.27, + "learning_rate": 4.551381044602478e-06, + "logits/chosen": -2.9548227787017822, + "logits/rejected": -2.8534159660339355, + "logps/chosen": -71.24727630615234, + "logps/rejected": -1036.587158203125, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04382842406630516, + "rewards/margins": 9.880915641784668, + "rewards/rejected": -9.924745559692383, + "step": 4600 + }, + { + "epoch": 0.27, + "learning_rate": 4.548402303271946e-06, + "logits/chosen": -2.8801655769348145, + "logits/rejected": -2.8027002811431885, + "logps/chosen": -61.337425231933594, + "logps/rejected": -1094.833251953125, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.056496210396289825, + "rewards/margins": 10.569719314575195, + "rewards/rejected": -10.513221740722656, + "step": 4610 + }, + { + "epoch": 0.28, + "learning_rate": 4.5454146870366775e-06, + "logits/chosen": -2.8988003730773926, + "logits/rejected": -2.8232078552246094, + "logps/chosen": -84.91638946533203, + "logps/rejected": -1046.2430419921875, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11448167264461517, + "rewards/margins": 9.906743049621582, + "rewards/rejected": -10.021224021911621, + "step": 4620 + }, + { + "epoch": 0.28, + "learning_rate": 4.542418208840816e-06, + "logits/chosen": -2.9094996452331543, + "logits/rejected": -2.83848237991333, + "logps/chosen": -69.73999786376953, + "logps/rejected": -972.4375, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01323959231376648, + "rewards/margins": 9.26408576965332, + "rewards/rejected": -9.277325630187988, + "step": 4630 + }, + { + "epoch": 0.28, + "learning_rate": 4.539412881666896e-06, + "logits/chosen": -2.9168286323547363, + "logits/rejected": -2.82263445854187, + "logps/chosen": -75.34038543701172, + "logps/rejected": -1006.7355346679688, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03974676877260208, + "rewards/margins": 9.594426155090332, + "rewards/rejected": -9.634172439575195, + "step": 4640 + }, + { + "epoch": 0.28, + "learning_rate": 4.536398718535795e-06, + "logits/chosen": -2.91206955909729, + "logits/rejected": -2.83095383644104, + "logps/chosen": -81.1929702758789, + "logps/rejected": -1025.0313720703125, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09227331727743149, + "rewards/margins": 9.729327201843262, + "rewards/rejected": -9.821599960327148, + "step": 4650 + }, + { + "epoch": 0.28, + "learning_rate": 4.5333757325066715e-06, + "logits/chosen": -2.8676838874816895, + "logits/rejected": -2.777498722076416, + "logps/chosen": -81.93919372558594, + "logps/rejected": -1140.127197265625, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10093430429697037, + "rewards/margins": 10.844438552856445, + "rewards/rejected": -10.945371627807617, + "step": 4660 + }, + { + "epoch": 0.28, + "learning_rate": 4.5303439366769095e-06, + "logits/chosen": -2.906921863555908, + "logits/rejected": -2.7776081562042236, + "logps/chosen": -93.50048065185547, + "logps/rejected": -1020.9957275390625, + "loss": 0.0306, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2694311738014221, + "rewards/margins": 9.503682136535645, + "rewards/rejected": -9.773112297058105, + "step": 4670 + }, + { + "epoch": 0.28, + "learning_rate": 4.527303344182065e-06, + "logits/chosen": -2.859835147857666, + "logits/rejected": -2.7575488090515137, + "logps/chosen": -106.78855895996094, + "logps/rejected": -1145.6884765625, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3507247865200043, + "rewards/margins": 10.670522689819336, + "rewards/rejected": -11.021248817443848, + "step": 4680 + }, + { + "epoch": 0.28, + "learning_rate": 4.524253968195802e-06, + "logits/chosen": -2.9059767723083496, + "logits/rejected": -2.810584783554077, + "logps/chosen": -101.52924346923828, + "logps/rejected": -1136.054931640625, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3442736864089966, + "rewards/margins": 10.585061073303223, + "rewards/rejected": -10.92933464050293, + "step": 4690 + }, + { + "epoch": 0.28, + "learning_rate": 4.521195821929843e-06, + "logits/chosen": -2.9197306632995605, + "logits/rejected": -2.7863106727600098, + "logps/chosen": -105.24568176269531, + "logps/rejected": -976.8414306640625, + "loss": 0.0227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3304779827594757, + "rewards/margins": 8.985427856445312, + "rewards/rejected": -9.31590461730957, + "step": 4700 + }, + { + "epoch": 0.28, + "learning_rate": 4.5181289186339085e-06, + "logits/chosen": -2.893448829650879, + "logits/rejected": -2.7539124488830566, + "logps/chosen": -100.33922576904297, + "logps/rejected": -984.7218017578125, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2708638310432434, + "rewards/margins": 9.130024909973145, + "rewards/rejected": -9.40088939666748, + "step": 4710 + }, + { + "epoch": 0.28, + "learning_rate": 4.51505327159566e-06, + "logits/chosen": -2.937783718109131, + "logits/rejected": -2.80378794670105, + "logps/chosen": -76.32362365722656, + "logps/rejected": -997.26123046875, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07866490632295609, + "rewards/margins": 9.43741226196289, + "rewards/rejected": -9.516077041625977, + "step": 4720 + }, + { + "epoch": 0.28, + "learning_rate": 4.511968894140639e-06, + "logits/chosen": -2.9309630393981934, + "logits/rejected": -2.826199531555176, + "logps/chosen": -89.91520690917969, + "logps/rejected": -880.4615478515625, + "loss": 0.0585, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13478204607963562, + "rewards/margins": 8.2269926071167, + "rewards/rejected": -8.361775398254395, + "step": 4730 + }, + { + "epoch": 0.28, + "learning_rate": 4.508875799632215e-06, + "logits/chosen": -2.9290318489074707, + "logits/rejected": -2.8303656578063965, + "logps/chosen": -131.4691619873047, + "logps/rejected": -1089.2655029296875, + "loss": 0.0224, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5839880108833313, + "rewards/margins": 9.863065719604492, + "rewards/rejected": -10.447053909301758, + "step": 4740 + }, + { + "epoch": 0.28, + "learning_rate": 4.505774001471527e-06, + "logits/chosen": -2.9127578735351562, + "logits/rejected": -2.7932186126708984, + "logps/chosen": -78.004150390625, + "logps/rejected": -1080.9903564453125, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10248641669750214, + "rewards/margins": 10.269338607788086, + "rewards/rejected": -10.371824264526367, + "step": 4750 + }, + { + "epoch": 0.28, + "learning_rate": 4.502663513097419e-06, + "logits/chosen": -2.926621198654175, + "logits/rejected": -2.77805757522583, + "logps/chosen": -92.62043762207031, + "logps/rejected": -1071.267578125, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1928451955318451, + "rewards/margins": 10.071054458618164, + "rewards/rejected": -10.263900756835938, + "step": 4760 + }, + { + "epoch": 0.28, + "learning_rate": 4.499544347986388e-06, + "logits/chosen": -2.8830108642578125, + "logits/rejected": -2.7515289783477783, + "logps/chosen": -89.59968566894531, + "logps/rejected": -1003.7515869140625, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2422805279493332, + "rewards/margins": 9.363607406616211, + "rewards/rejected": -9.605888366699219, + "step": 4770 + }, + { + "epoch": 0.29, + "learning_rate": 4.4964165196525255e-06, + "logits/chosen": -2.941171407699585, + "logits/rejected": -2.8077425956726074, + "logps/chosen": -96.42909240722656, + "logps/rejected": -1170.781005859375, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2246961146593094, + "rewards/margins": 11.036178588867188, + "rewards/rejected": -11.260873794555664, + "step": 4780 + }, + { + "epoch": 0.29, + "learning_rate": 4.493280041647454e-06, + "logits/chosen": -2.9356400966644287, + "logits/rejected": -2.780233144760132, + "logps/chosen": -119.87623596191406, + "logps/rejected": -1087.643310546875, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46486926078796387, + "rewards/margins": 9.971296310424805, + "rewards/rejected": -10.436163902282715, + "step": 4790 + }, + { + "epoch": 0.29, + "learning_rate": 4.490134927560276e-06, + "logits/chosen": -2.92179799079895, + "logits/rejected": -2.7985856533050537, + "logps/chosen": -123.69156646728516, + "logps/rejected": -1119.8121337890625, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5838445425033569, + "rewards/margins": 10.174220085144043, + "rewards/rejected": -10.758064270019531, + "step": 4800 + }, + { + "epoch": 0.29, + "learning_rate": 4.486981191017505e-06, + "logits/chosen": -2.928393840789795, + "logits/rejected": -2.760741710662842, + "logps/chosen": -92.12145233154297, + "logps/rejected": -1152.6842041015625, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23580631613731384, + "rewards/margins": 10.84332275390625, + "rewards/rejected": -11.07912826538086, + "step": 4810 + }, + { + "epoch": 0.29, + "learning_rate": 4.4838188456830175e-06, + "logits/chosen": -2.8960201740264893, + "logits/rejected": -2.796502113342285, + "logps/chosen": -83.48484802246094, + "logps/rejected": -1064.636962890625, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12085038423538208, + "rewards/margins": 10.091634750366211, + "rewards/rejected": -10.212484359741211, + "step": 4820 + }, + { + "epoch": 0.29, + "learning_rate": 4.480647905257985e-06, + "logits/chosen": -2.928720474243164, + "logits/rejected": -2.818305015563965, + "logps/chosen": -96.21577453613281, + "logps/rejected": -1090.26708984375, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2710864543914795, + "rewards/margins": 10.182416915893555, + "rewards/rejected": -10.45350456237793, + "step": 4830 + }, + { + "epoch": 0.29, + "learning_rate": 4.47746838348082e-06, + "logits/chosen": -2.9394912719726562, + "logits/rejected": -2.8059401512145996, + "logps/chosen": -75.38832092285156, + "logps/rejected": -980.80712890625, + "loss": 0.0263, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05348791554570198, + "rewards/margins": 9.331521987915039, + "rewards/rejected": -9.385010719299316, + "step": 4840 + }, + { + "epoch": 0.29, + "learning_rate": 4.474280294127112e-06, + "logits/chosen": -2.9221832752227783, + "logits/rejected": -2.852569580078125, + "logps/chosen": -73.40013122558594, + "logps/rejected": -1074.966064453125, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08040627092123032, + "rewards/margins": 10.232905387878418, + "rewards/rejected": -10.313311576843262, + "step": 4850 + }, + { + "epoch": 0.29, + "learning_rate": 4.471083651009574e-06, + "logits/chosen": -2.9261255264282227, + "logits/rejected": -2.8120765686035156, + "logps/chosen": -87.65428161621094, + "logps/rejected": -1101.6876220703125, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2045680582523346, + "rewards/margins": 10.38014030456543, + "rewards/rejected": -10.584708213806152, + "step": 4860 + }, + { + "epoch": 0.29, + "learning_rate": 4.4678784679779766e-06, + "logits/chosen": -2.9147346019744873, + "logits/rejected": -2.8132572174072266, + "logps/chosen": -73.7406997680664, + "logps/rejected": -1141.0323486328125, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0162602998316288, + "rewards/margins": 10.989324569702148, + "rewards/rejected": -10.973065376281738, + "step": 4870 + }, + { + "epoch": 0.29, + "learning_rate": 4.464664758919092e-06, + "logits/chosen": -2.9283194541931152, + "logits/rejected": -2.81685733795166, + "logps/chosen": -95.83625793457031, + "logps/rejected": -1124.4720458984375, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23186016082763672, + "rewards/margins": 10.574603080749512, + "rewards/rejected": -10.806463241577148, + "step": 4880 + }, + { + "epoch": 0.29, + "learning_rate": 4.461442537756629e-06, + "logits/chosen": -2.8623392581939697, + "logits/rejected": -2.753228187561035, + "logps/chosen": -108.88375091552734, + "logps/rejected": -1113.374267578125, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36459964513778687, + "rewards/margins": 10.332502365112305, + "rewards/rejected": -10.697102546691895, + "step": 4890 + }, + { + "epoch": 0.29, + "learning_rate": 4.458211818451179e-06, + "logits/chosen": -2.9641549587249756, + "logits/rejected": -2.8256726264953613, + "logps/chosen": -75.55189514160156, + "logps/rejected": -1139.4482421875, + "loss": 0.0313, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.1000233069062233, + "rewards/margins": 10.85367488861084, + "rewards/rejected": -10.953699111938477, + "step": 4900 + }, + { + "epoch": 0.29, + "learning_rate": 4.454972615000153e-06, + "logits/chosen": -2.8726203441619873, + "logits/rejected": -2.7775096893310547, + "logps/chosen": -76.64668273925781, + "logps/rejected": -922.6935424804688, + "loss": 0.0287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05730264633893967, + "rewards/margins": 8.723938941955566, + "rewards/rejected": -8.781240463256836, + "step": 4910 + }, + { + "epoch": 0.29, + "learning_rate": 4.451724941437718e-06, + "logits/chosen": -2.9053893089294434, + "logits/rejected": -2.7921721935272217, + "logps/chosen": -79.14024353027344, + "logps/rejected": -1114.6865234375, + "loss": 0.0307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008538919501006603, + "rewards/margins": 10.700807571411133, + "rewards/rejected": -10.709346771240234, + "step": 4920 + }, + { + "epoch": 0.29, + "learning_rate": 4.448468811834739e-06, + "logits/chosen": -2.932300329208374, + "logits/rejected": -2.854891538619995, + "logps/chosen": -78.27799987792969, + "logps/rejected": -904.4276123046875, + "loss": 0.0268, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11643964052200317, + "rewards/margins": 8.488099098205566, + "rewards/rejected": -8.604537963867188, + "step": 4930 + }, + { + "epoch": 0.29, + "learning_rate": 4.445204240298718e-06, + "logits/chosen": -2.9165663719177246, + "logits/rejected": -2.803201913833618, + "logps/chosen": -72.69478607177734, + "logps/rejected": -1076.4212646484375, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03252802789211273, + "rewards/margins": 10.271202087402344, + "rewards/rejected": -10.303731918334961, + "step": 4940 + }, + { + "epoch": 0.3, + "learning_rate": 4.441931240973735e-06, + "logits/chosen": -2.94026780128479, + "logits/rejected": -2.8018202781677246, + "logps/chosen": -95.87989807128906, + "logps/rejected": -1074.091064453125, + "loss": 0.0373, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2970934808254242, + "rewards/margins": 10.007808685302734, + "rewards/rejected": -10.304903030395508, + "step": 4950 + }, + { + "epoch": 0.3, + "learning_rate": 4.43864982804038e-06, + "logits/chosen": -2.9383206367492676, + "logits/rejected": -2.796290636062622, + "logps/chosen": -94.48403930664062, + "logps/rejected": -1010.3338623046875, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20962488651275635, + "rewards/margins": 9.447382926940918, + "rewards/rejected": -9.657008171081543, + "step": 4960 + }, + { + "epoch": 0.3, + "learning_rate": 4.435360015715697e-06, + "logits/chosen": -2.9326417446136475, + "logits/rejected": -2.766787052154541, + "logps/chosen": -96.70450592041016, + "logps/rejected": -1143.965576171875, + "loss": 0.0224, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23923973739147186, + "rewards/margins": 10.757135391235352, + "rewards/rejected": -10.996376037597656, + "step": 4970 + }, + { + "epoch": 0.3, + "learning_rate": 4.4320618182531244e-06, + "logits/chosen": -2.8783507347106934, + "logits/rejected": -2.7696948051452637, + "logps/chosen": -96.92609405517578, + "logps/rejected": -1114.8580322265625, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23641660809516907, + "rewards/margins": 10.473978996276855, + "rewards/rejected": -10.710395812988281, + "step": 4980 + }, + { + "epoch": 0.3, + "learning_rate": 4.428755249942425e-06, + "logits/chosen": -2.94284725189209, + "logits/rejected": -2.805267810821533, + "logps/chosen": -88.04710388183594, + "logps/rejected": -1128.584716796875, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15025809407234192, + "rewards/margins": 10.699899673461914, + "rewards/rejected": -10.850159645080566, + "step": 4990 + }, + { + "epoch": 0.3, + "learning_rate": 4.4254403251096345e-06, + "logits/chosen": -2.9280588626861572, + "logits/rejected": -2.808825969696045, + "logps/chosen": -106.11286926269531, + "logps/rejected": -1052.760009765625, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37431150674819946, + "rewards/margins": 9.705513000488281, + "rewards/rejected": -10.079824447631836, + "step": 5000 + }, + { + "epoch": 0.3, + "learning_rate": 4.422117058116989e-06, + "logits/chosen": -2.8958401679992676, + "logits/rejected": -2.794448137283325, + "logps/chosen": -108.98091125488281, + "logps/rejected": -1148.6414794921875, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35968703031539917, + "rewards/margins": 10.67334270477295, + "rewards/rejected": -11.033029556274414, + "step": 5010 + }, + { + "epoch": 0.3, + "learning_rate": 4.418785463362871e-06, + "logits/chosen": -2.918119430541992, + "logits/rejected": -2.799116849899292, + "logps/chosen": -100.15885162353516, + "logps/rejected": -1211.322998046875, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2800785005092621, + "rewards/margins": 11.365874290466309, + "rewards/rejected": -11.645952224731445, + "step": 5020 + }, + { + "epoch": 0.3, + "learning_rate": 4.415445555281742e-06, + "logits/chosen": -2.90566086769104, + "logits/rejected": -2.805044174194336, + "logps/chosen": -115.4151611328125, + "logps/rejected": -1186.9598388671875, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4487641751766205, + "rewards/margins": 10.98048210144043, + "rewards/rejected": -11.429245948791504, + "step": 5030 + }, + { + "epoch": 0.3, + "learning_rate": 4.412097348344084e-06, + "logits/chosen": -2.890604257583618, + "logits/rejected": -2.7863199710845947, + "logps/chosen": -94.54801940917969, + "logps/rejected": -1265.533447265625, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20007574558258057, + "rewards/margins": 12.013483047485352, + "rewards/rejected": -12.213560104370117, + "step": 5040 + }, + { + "epoch": 0.3, + "learning_rate": 4.408740857056332e-06, + "logits/chosen": -2.9250786304473877, + "logits/rejected": -2.796694040298462, + "logps/chosen": -113.1000747680664, + "logps/rejected": -1204.5736083984375, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.441622793674469, + "rewards/margins": 11.159921646118164, + "rewards/rejected": -11.601545333862305, + "step": 5050 + }, + { + "epoch": 0.3, + "learning_rate": 4.405376095960816e-06, + "logits/chosen": -2.9202957153320312, + "logits/rejected": -2.794678211212158, + "logps/chosen": -114.0595703125, + "logps/rejected": -1091.915283203125, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4511253237724304, + "rewards/margins": 10.023406028747559, + "rewards/rejected": -10.474531173706055, + "step": 5060 + }, + { + "epoch": 0.3, + "learning_rate": 4.402003079635695e-06, + "logits/chosen": -2.926386594772339, + "logits/rejected": -2.801600694656372, + "logps/chosen": -120.9647216796875, + "logps/rejected": -1005.9732666015625, + "loss": 0.0255, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5526293516159058, + "rewards/margins": 9.075660705566406, + "rewards/rejected": -9.628290176391602, + "step": 5070 + }, + { + "epoch": 0.3, + "learning_rate": 4.398621822694894e-06, + "logits/chosen": -2.906467914581299, + "logits/rejected": -2.8236374855041504, + "logps/chosen": -71.70893859863281, + "logps/rejected": -1108.146728515625, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005425202660262585, + "rewards/margins": 10.629765510559082, + "rewards/rejected": -10.62433910369873, + "step": 5080 + }, + { + "epoch": 0.3, + "learning_rate": 4.3952323397880426e-06, + "logits/chosen": -2.89420747756958, + "logits/rejected": -2.778454303741455, + "logps/chosen": -69.36300659179688, + "logps/rejected": -906.44921875, + "loss": 0.0249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.009314288385212421, + "rewards/margins": 8.618794441223145, + "rewards/rejected": -8.609479904174805, + "step": 5090 + }, + { + "epoch": 0.3, + "learning_rate": 4.391834645600408e-06, + "logits/chosen": -2.9491798877716064, + "logits/rejected": -2.8280227184295654, + "logps/chosen": -76.1402587890625, + "logps/rejected": -1026.66015625, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0630360096693039, + "rewards/margins": 9.7501859664917, + "rewards/rejected": -9.813223838806152, + "step": 5100 + }, + { + "epoch": 0.3, + "learning_rate": 4.388428754852835e-06, + "logits/chosen": -2.878732681274414, + "logits/rejected": -2.78424072265625, + "logps/chosen": -80.48857116699219, + "logps/rejected": -972.7552490234375, + "loss": 0.0209, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.134246364235878, + "rewards/margins": 9.154767990112305, + "rewards/rejected": -9.28901481628418, + "step": 5110 + }, + { + "epoch": 0.31, + "learning_rate": 4.385014682301682e-06, + "logits/chosen": -2.9347071647644043, + "logits/rejected": -2.7972300052642822, + "logps/chosen": -91.8586654663086, + "logps/rejected": -1073.089111328125, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22346599400043488, + "rewards/margins": 10.061551094055176, + "rewards/rejected": -10.285016059875488, + "step": 5120 + }, + { + "epoch": 0.31, + "learning_rate": 4.381592442738753e-06, + "logits/chosen": -2.8769049644470215, + "logits/rejected": -2.7817323207855225, + "logps/chosen": -66.82271575927734, + "logps/rejected": -978.7667236328125, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01835566759109497, + "rewards/margins": 9.366061210632324, + "rewards/rejected": -9.347704887390137, + "step": 5130 + }, + { + "epoch": 0.31, + "learning_rate": 4.3781620509912395e-06, + "logits/chosen": -2.8907856941223145, + "logits/rejected": -2.7915472984313965, + "logps/chosen": -84.63075256347656, + "logps/rejected": -1095.581298828125, + "loss": 0.0273, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10715818405151367, + "rewards/margins": 10.430553436279297, + "rewards/rejected": -10.537710189819336, + "step": 5140 + }, + { + "epoch": 0.31, + "learning_rate": 4.374723521921651e-06, + "logits/chosen": -2.895655632019043, + "logits/rejected": -2.7703962326049805, + "logps/chosen": -101.97102355957031, + "logps/rejected": -1050.6082763671875, + "loss": 0.061, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.26692596077919006, + "rewards/margins": 9.794083595275879, + "rewards/rejected": -10.061009407043457, + "step": 5150 + }, + { + "epoch": 0.31, + "learning_rate": 4.3712768704277535e-06, + "logits/chosen": -2.924736738204956, + "logits/rejected": -2.762291431427002, + "logps/chosen": -107.10150146484375, + "logps/rejected": -1177.926513671875, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35556110739707947, + "rewards/margins": 10.972970962524414, + "rewards/rejected": -11.328532218933105, + "step": 5160 + }, + { + "epoch": 0.31, + "learning_rate": 4.367822111442504e-06, + "logits/chosen": -2.9397132396698, + "logits/rejected": -2.830716848373413, + "logps/chosen": -87.31817626953125, + "logps/rejected": -1015.3263549804688, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1307411640882492, + "rewards/margins": 9.583714485168457, + "rewards/rejected": -9.714456558227539, + "step": 5170 + }, + { + "epoch": 0.31, + "learning_rate": 4.364359259933985e-06, + "logits/chosen": -2.914748430252075, + "logits/rejected": -2.792332649230957, + "logps/chosen": -109.8978271484375, + "logps/rejected": -1092.9752197265625, + "loss": 0.0249, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36138713359832764, + "rewards/margins": 10.136848449707031, + "rewards/rejected": -10.498235702514648, + "step": 5180 + }, + { + "epoch": 0.31, + "learning_rate": 4.3608883309053425e-06, + "logits/chosen": -2.935044527053833, + "logits/rejected": -2.8275039196014404, + "logps/chosen": -137.51113891601562, + "logps/rejected": -978.7269287109375, + "loss": 0.0477, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6427118182182312, + "rewards/margins": 8.703424453735352, + "rewards/rejected": -9.346137046813965, + "step": 5190 + }, + { + "epoch": 0.31, + "learning_rate": 4.35740933939472e-06, + "logits/chosen": -2.8802666664123535, + "logits/rejected": -2.801466464996338, + "logps/chosen": -112.32859802246094, + "logps/rejected": -1065.3653564453125, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3624281585216522, + "rewards/margins": 9.844383239746094, + "rewards/rejected": -10.206811904907227, + "step": 5200 + }, + { + "epoch": 0.31, + "learning_rate": 4.353922300475189e-06, + "logits/chosen": -2.9154207706451416, + "logits/rejected": -2.7947864532470703, + "logps/chosen": -90.10201263427734, + "logps/rejected": -1063.709228515625, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21142025291919708, + "rewards/margins": 9.986998558044434, + "rewards/rejected": -10.198419570922852, + "step": 5210 + }, + { + "epoch": 0.31, + "learning_rate": 4.350427229254689e-06, + "logits/chosen": -2.8895621299743652, + "logits/rejected": -2.805819034576416, + "logps/chosen": -73.00257873535156, + "logps/rejected": -948.2406005859375, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03478895127773285, + "rewards/margins": 9.013351440429688, + "rewards/rejected": -9.048139572143555, + "step": 5220 + }, + { + "epoch": 0.31, + "learning_rate": 4.346924140875961e-06, + "logits/chosen": -2.8794350624084473, + "logits/rejected": -2.7719314098358154, + "logps/chosen": -75.07901763916016, + "logps/rejected": -1007.3990478515625, + "loss": 0.0211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03966792672872543, + "rewards/margins": 9.588266372680664, + "rewards/rejected": -9.627934455871582, + "step": 5230 + }, + { + "epoch": 0.31, + "learning_rate": 4.34341305051648e-06, + "logits/chosen": -2.85591459274292, + "logits/rejected": -2.734689235687256, + "logps/chosen": -86.8490219116211, + "logps/rejected": -1034.8328857421875, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16776810586452484, + "rewards/margins": 9.731412887573242, + "rewards/rejected": -9.899181365966797, + "step": 5240 + }, + { + "epoch": 0.31, + "learning_rate": 4.339893973388392e-06, + "logits/chosen": -2.9150335788726807, + "logits/rejected": -2.8110556602478027, + "logps/chosen": -116.9923324584961, + "logps/rejected": -999.41455078125, + "loss": 0.0346, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5206400752067566, + "rewards/margins": 9.03512954711914, + "rewards/rejected": -9.555768966674805, + "step": 5250 + }, + { + "epoch": 0.31, + "learning_rate": 4.3363669247384446e-06, + "logits/chosen": -2.937760591506958, + "logits/rejected": -2.842745304107666, + "logps/chosen": -85.13516998291016, + "logps/rejected": -1062.99609375, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1540161669254303, + "rewards/margins": 10.041516304016113, + "rewards/rejected": -10.195531845092773, + "step": 5260 + }, + { + "epoch": 0.31, + "learning_rate": 4.332831919847922e-06, + "logits/chosen": -2.9191360473632812, + "logits/rejected": -2.8050339221954346, + "logps/chosen": -113.01680755615234, + "logps/rejected": -957.55078125, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4381740987300873, + "rewards/margins": 8.692211151123047, + "rewards/rejected": -9.130385398864746, + "step": 5270 + }, + { + "epoch": 0.31, + "learning_rate": 4.329288974032583e-06, + "logits/chosen": -2.905866861343384, + "logits/rejected": -2.773118495941162, + "logps/chosen": -123.447509765625, + "logps/rejected": -1007.7584228515625, + "loss": 0.031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45142611861228943, + "rewards/margins": 9.184259414672852, + "rewards/rejected": -9.635684967041016, + "step": 5280 + }, + { + "epoch": 0.32, + "learning_rate": 4.325738102642589e-06, + "logits/chosen": -2.9166271686553955, + "logits/rejected": -2.7881298065185547, + "logps/chosen": -87.07463836669922, + "logps/rejected": -1023.7344970703125, + "loss": 0.0448, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15132933855056763, + "rewards/margins": 9.644368171691895, + "rewards/rejected": -9.795698165893555, + "step": 5290 + }, + { + "epoch": 0.32, + "learning_rate": 4.322179321062439e-06, + "logits/chosen": -2.900089979171753, + "logits/rejected": -2.793597459793091, + "logps/chosen": -97.57390594482422, + "logps/rejected": -1073.473876953125, + "loss": 0.0288, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3010684847831726, + "rewards/margins": 9.992565155029297, + "rewards/rejected": -10.293633460998535, + "step": 5300 + }, + { + "epoch": 0.32, + "learning_rate": 4.318612644710906e-06, + "logits/chosen": -2.8975777626037598, + "logits/rejected": -2.829590082168579, + "logps/chosen": -86.01814270019531, + "logps/rejected": -1105.52685546875, + "loss": 0.0324, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.16788221895694733, + "rewards/margins": 10.444056510925293, + "rewards/rejected": -10.6119384765625, + "step": 5310 + }, + { + "epoch": 0.32, + "learning_rate": 4.315038089040965e-06, + "logits/chosen": -2.9081172943115234, + "logits/rejected": -2.7894349098205566, + "logps/chosen": -61.70881271362305, + "logps/rejected": -1040.346435546875, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.043680790811777115, + "rewards/margins": 10.004947662353516, + "rewards/rejected": -9.961265563964844, + "step": 5320 + }, + { + "epoch": 0.32, + "learning_rate": 4.311455669539732e-06, + "logits/chosen": -2.905578136444092, + "logits/rejected": -2.81683087348938, + "logps/chosen": -65.98013305664062, + "logps/rejected": -1031.008056640625, + "loss": 0.0441, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.051032789051532745, + "rewards/margins": 9.92083740234375, + "rewards/rejected": -9.869805335998535, + "step": 5330 + }, + { + "epoch": 0.32, + "learning_rate": 4.307865401728392e-06, + "logits/chosen": -2.9275906085968018, + "logits/rejected": -2.8506433963775635, + "logps/chosen": -69.98945617675781, + "logps/rejected": -1111.661376953125, + "loss": 0.0196, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03507096320390701, + "rewards/margins": 10.716146469116211, + "rewards/rejected": -10.681076049804688, + "step": 5340 + }, + { + "epoch": 0.32, + "learning_rate": 4.3042673011621334e-06, + "logits/chosen": -2.8653833866119385, + "logits/rejected": -2.756660223007202, + "logps/chosen": -66.45137786865234, + "logps/rejected": -1089.083984375, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.019074004143476486, + "rewards/margins": 10.478767395019531, + "rewards/rejected": -10.459692001342773, + "step": 5350 + }, + { + "epoch": 0.32, + "learning_rate": 4.300661383430081e-06, + "logits/chosen": -2.9260733127593994, + "logits/rejected": -2.8223023414611816, + "logps/chosen": -72.69377136230469, + "logps/rejected": -1047.390869140625, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07252846658229828, + "rewards/margins": 9.955301284790039, + "rewards/rejected": -10.027830123901367, + "step": 5360 + }, + { + "epoch": 0.32, + "learning_rate": 4.2970476641552304e-06, + "logits/chosen": -2.912391424179077, + "logits/rejected": -2.8059020042419434, + "logps/chosen": -68.8610610961914, + "logps/rejected": -1084.7510986328125, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.030656863003969193, + "rewards/margins": 10.447047233581543, + "rewards/rejected": -10.416390419006348, + "step": 5370 + }, + { + "epoch": 0.32, + "learning_rate": 4.293426158994375e-06, + "logits/chosen": -2.9317848682403564, + "logits/rejected": -2.7879457473754883, + "logps/chosen": -73.31602478027344, + "logps/rejected": -1076.7596435546875, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09305614233016968, + "rewards/margins": 10.234904289245605, + "rewards/rejected": -10.327960014343262, + "step": 5380 + }, + { + "epoch": 0.32, + "learning_rate": 4.289796883638042e-06, + "logits/chosen": -2.9145305156707764, + "logits/rejected": -2.869666337966919, + "logps/chosen": -79.21398162841797, + "logps/rejected": -976.82373046875, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.059015024453401566, + "rewards/margins": 9.27155876159668, + "rewards/rejected": -9.330573081970215, + "step": 5390 + }, + { + "epoch": 0.32, + "learning_rate": 4.2861598538104255e-06, + "logits/chosen": -2.918776035308838, + "logits/rejected": -2.799042224884033, + "logps/chosen": -92.83975219726562, + "logps/rejected": -1122.894775390625, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2426188886165619, + "rewards/margins": 10.524441719055176, + "rewards/rejected": -10.767061233520508, + "step": 5400 + }, + { + "epoch": 0.32, + "learning_rate": 4.282515085269315e-06, + "logits/chosen": -2.941538095474243, + "logits/rejected": -2.8051202297210693, + "logps/chosen": -98.79366302490234, + "logps/rejected": -1020.1435546875, + "loss": 0.0337, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.257463276386261, + "rewards/margins": 9.506620407104492, + "rewards/rejected": -9.76408576965332, + "step": 5410 + }, + { + "epoch": 0.32, + "learning_rate": 4.278862593806029e-06, + "logits/chosen": -2.9192214012145996, + "logits/rejected": -2.7784857749938965, + "logps/chosen": -83.96290588378906, + "logps/rejected": -964.5505981445312, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1542671024799347, + "rewards/margins": 9.048551559448242, + "rewards/rejected": -9.202820777893066, + "step": 5420 + }, + { + "epoch": 0.32, + "learning_rate": 4.275202395245346e-06, + "logits/chosen": -2.905764102935791, + "logits/rejected": -2.805637836456299, + "logps/chosen": -103.6803970336914, + "logps/rejected": -1055.0147705078125, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.293278306722641, + "rewards/margins": 9.819478988647461, + "rewards/rejected": -10.11275863647461, + "step": 5430 + }, + { + "epoch": 0.32, + "learning_rate": 4.271534505445438e-06, + "logits/chosen": -2.935650110244751, + "logits/rejected": -2.779548406600952, + "logps/chosen": -82.7082290649414, + "logps/rejected": -1142.6175537109375, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0751660019159317, + "rewards/margins": 10.8822021484375, + "rewards/rejected": -10.957368850708008, + "step": 5440 + }, + { + "epoch": 0.32, + "learning_rate": 4.267858940297799e-06, + "logits/chosen": -2.8821425437927246, + "logits/rejected": -2.7712297439575195, + "logps/chosen": -89.7991943359375, + "logps/rejected": -1161.3721923828125, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2234000861644745, + "rewards/margins": 10.930200576782227, + "rewards/rejected": -11.153600692749023, + "step": 5450 + }, + { + "epoch": 0.33, + "learning_rate": 4.264175715727176e-06, + "logits/chosen": -2.9116759300231934, + "logits/rejected": -2.7574896812438965, + "logps/chosen": -92.06423950195312, + "logps/rejected": -1130.909912109375, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22101660072803497, + "rewards/margins": 10.647990226745605, + "rewards/rejected": -10.869007110595703, + "step": 5460 + }, + { + "epoch": 0.33, + "learning_rate": 4.2604848476915015e-06, + "logits/chosen": -2.886748790740967, + "logits/rejected": -2.790830135345459, + "logps/chosen": -67.21646881103516, + "logps/rejected": -1200.1285400390625, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005697926972061396, + "rewards/margins": 11.564594268798828, + "rewards/rejected": -11.5588960647583, + "step": 5470 + }, + { + "epoch": 0.33, + "learning_rate": 4.256786352181827e-06, + "logits/chosen": -2.9011101722717285, + "logits/rejected": -2.834777593612671, + "logps/chosen": -81.32843017578125, + "logps/rejected": -1057.913818359375, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06897179037332535, + "rewards/margins": 10.05571460723877, + "rewards/rejected": -10.124686241149902, + "step": 5480 + }, + { + "epoch": 0.33, + "learning_rate": 4.253080245222246e-06, + "logits/chosen": -2.9348692893981934, + "logits/rejected": -2.856180429458618, + "logps/chosen": -72.4715576171875, + "logps/rejected": -1090.723388671875, + "loss": 0.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.025209635496139526, + "rewards/margins": 10.495555877685547, + "rewards/rejected": -10.470346450805664, + "step": 5490 + }, + { + "epoch": 0.33, + "learning_rate": 4.249366542869835e-06, + "logits/chosen": -2.928931474685669, + "logits/rejected": -2.8061206340789795, + "logps/chosen": -99.90522766113281, + "logps/rejected": -1051.652099609375, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.270371675491333, + "rewards/margins": 9.804542541503906, + "rewards/rejected": -10.074914932250977, + "step": 5500 + }, + { + "epoch": 0.33, + "learning_rate": 4.245645261214572e-06, + "logits/chosen": -2.9076218605041504, + "logits/rejected": -2.793264389038086, + "logps/chosen": -91.87276458740234, + "logps/rejected": -1218.15234375, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16891315579414368, + "rewards/margins": 11.567235946655273, + "rewards/rejected": -11.736149787902832, + "step": 5510 + }, + { + "epoch": 0.33, + "learning_rate": 4.24191641637928e-06, + "logits/chosen": -2.9241397380828857, + "logits/rejected": -2.8189492225646973, + "logps/chosen": -87.92935943603516, + "logps/rejected": -1165.9312744140625, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17771950364112854, + "rewards/margins": 11.028741836547852, + "rewards/rejected": -11.206462860107422, + "step": 5520 + }, + { + "epoch": 0.33, + "learning_rate": 4.238180024519543e-06, + "logits/chosen": -2.9220926761627197, + "logits/rejected": -2.812472343444824, + "logps/chosen": -78.54875183105469, + "logps/rejected": -1018.8239135742188, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08486328274011612, + "rewards/margins": 9.656705856323242, + "rewards/rejected": -9.741570472717285, + "step": 5530 + }, + { + "epoch": 0.33, + "learning_rate": 4.234436101823648e-06, + "logits/chosen": -2.919038772583008, + "logits/rejected": -2.8116660118103027, + "logps/chosen": -66.53028869628906, + "logps/rejected": -1173.835693359375, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.007306198589503765, + "rewards/margins": 11.290348052978516, + "rewards/rejected": -11.283041000366211, + "step": 5540 + }, + { + "epoch": 0.33, + "learning_rate": 4.230684664512509e-06, + "logits/chosen": -2.932173013687134, + "logits/rejected": -2.8377366065979004, + "logps/chosen": -66.55361938476562, + "logps/rejected": -989.4749145507812, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05417945235967636, + "rewards/margins": 9.51244831085205, + "rewards/rejected": -9.458269119262695, + "step": 5550 + }, + { + "epoch": 0.33, + "learning_rate": 4.226925728839598e-06, + "logits/chosen": -2.9452261924743652, + "logits/rejected": -2.8142213821411133, + "logps/chosen": -99.51521301269531, + "logps/rejected": -1074.3839111328125, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2224973440170288, + "rewards/margins": 10.062762260437012, + "rewards/rejected": -10.285259246826172, + "step": 5560 + }, + { + "epoch": 0.33, + "learning_rate": 4.223159311090874e-06, + "logits/chosen": -2.8924169540405273, + "logits/rejected": -2.756124496459961, + "logps/chosen": -96.50779724121094, + "logps/rejected": -1127.698486328125, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23888087272644043, + "rewards/margins": 10.595687866210938, + "rewards/rejected": -10.834568977355957, + "step": 5570 + }, + { + "epoch": 0.33, + "learning_rate": 4.2193854275847115e-06, + "logits/chosen": -2.9521889686584473, + "logits/rejected": -2.8155055046081543, + "logps/chosen": -83.12867736816406, + "logps/rejected": -1134.474853515625, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16818618774414062, + "rewards/margins": 10.723310470581055, + "rewards/rejected": -10.891496658325195, + "step": 5580 + }, + { + "epoch": 0.33, + "learning_rate": 4.215604094671835e-06, + "logits/chosen": -2.935363292694092, + "logits/rejected": -2.823221206665039, + "logps/chosen": -113.07855224609375, + "logps/rejected": -1123.1505126953125, + "loss": 0.0279, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.42306798696517944, + "rewards/margins": 10.36509895324707, + "rewards/rejected": -10.788165092468262, + "step": 5590 + }, + { + "epoch": 0.33, + "learning_rate": 4.211815328735239e-06, + "logits/chosen": -2.902005434036255, + "logits/rejected": -2.788292169570923, + "logps/chosen": -89.6695327758789, + "logps/rejected": -971.9357299804688, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18500728905200958, + "rewards/margins": 9.09084415435791, + "rewards/rejected": -9.27585220336914, + "step": 5600 + }, + { + "epoch": 0.33, + "learning_rate": 4.208019146190127e-06, + "logits/chosen": -2.8818888664245605, + "logits/rejected": -2.8172061443328857, + "logps/chosen": -80.78018951416016, + "logps/rejected": -1142.979248046875, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10595504939556122, + "rewards/margins": 10.88661003112793, + "rewards/rejected": -10.992565155029297, + "step": 5610 + }, + { + "epoch": 0.34, + "learning_rate": 4.204215563483833e-06, + "logits/chosen": -2.9177119731903076, + "logits/rejected": -2.8300135135650635, + "logps/chosen": -100.86442565917969, + "logps/rejected": -871.18798828125, + "loss": 0.083, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.36082345247268677, + "rewards/margins": 7.9124860763549805, + "rewards/rejected": -8.273309707641602, + "step": 5620 + }, + { + "epoch": 0.34, + "learning_rate": 4.200404597095754e-06, + "logits/chosen": -2.903635025024414, + "logits/rejected": -2.780912399291992, + "logps/chosen": -128.10079956054688, + "logps/rejected": -951.4157104492188, + "loss": 0.0204, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5435784459114075, + "rewards/margins": 8.542450904846191, + "rewards/rejected": -9.086029052734375, + "step": 5630 + }, + { + "epoch": 0.34, + "learning_rate": 4.196586263537277e-06, + "logits/chosen": -2.8814871311187744, + "logits/rejected": -2.780014753341675, + "logps/chosen": -85.3988037109375, + "logps/rejected": -1053.209716796875, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10892568528652191, + "rewards/margins": 9.975194931030273, + "rewards/rejected": -10.084120750427246, + "step": 5640 + }, + { + "epoch": 0.34, + "learning_rate": 4.192760579351708e-06, + "logits/chosen": -2.925739049911499, + "logits/rejected": -2.7880098819732666, + "logps/chosen": -78.75343322753906, + "logps/rejected": -1094.351806640625, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09332330524921417, + "rewards/margins": 10.394277572631836, + "rewards/rejected": -10.487602233886719, + "step": 5650 + }, + { + "epoch": 0.34, + "learning_rate": 4.188927561114201e-06, + "logits/chosen": -2.9139280319213867, + "logits/rejected": -2.7899022102355957, + "logps/chosen": -76.35122680664062, + "logps/rejected": -1034.1761474609375, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05266835168004036, + "rewards/margins": 9.847970962524414, + "rewards/rejected": -9.900638580322266, + "step": 5660 + }, + { + "epoch": 0.34, + "learning_rate": 4.185087225431686e-06, + "logits/chosen": -2.919797420501709, + "logits/rejected": -2.8353352546691895, + "logps/chosen": -85.43548583984375, + "logps/rejected": -1090.5357666015625, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17850326001644135, + "rewards/margins": 10.278223991394043, + "rewards/rejected": -10.456727981567383, + "step": 5670 + }, + { + "epoch": 0.34, + "learning_rate": 4.181239588942793e-06, + "logits/chosen": -2.9361441135406494, + "logits/rejected": -2.7996506690979004, + "logps/chosen": -80.62799072265625, + "logps/rejected": -969.0096435546875, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027362119406461716, + "rewards/margins": 9.228143692016602, + "rewards/rejected": -9.255505561828613, + "step": 5680 + }, + { + "epoch": 0.34, + "learning_rate": 4.177384668317788e-06, + "logits/chosen": -2.930062770843506, + "logits/rejected": -2.8270697593688965, + "logps/chosen": -75.63851928710938, + "logps/rejected": -941.7664184570312, + "loss": 0.0258, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07435659319162369, + "rewards/margins": 8.918901443481445, + "rewards/rejected": -8.993257522583008, + "step": 5690 + }, + { + "epoch": 0.34, + "learning_rate": 4.173522480258494e-06, + "logits/chosen": -2.8649230003356934, + "logits/rejected": -2.785822868347168, + "logps/chosen": -87.01467895507812, + "logps/rejected": -977.3831176757812, + "loss": 0.0206, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14952056109905243, + "rewards/margins": 9.178632736206055, + "rewards/rejected": -9.328152656555176, + "step": 5700 + }, + { + "epoch": 0.34, + "learning_rate": 4.1696530414982225e-06, + "logits/chosen": -2.9223949909210205, + "logits/rejected": -2.8064534664154053, + "logps/chosen": -88.6563491821289, + "logps/rejected": -1160.025390625, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13313785195350647, + "rewards/margins": 11.01543140411377, + "rewards/rejected": -11.148569107055664, + "step": 5710 + }, + { + "epoch": 0.34, + "learning_rate": 4.165776368801695e-06, + "logits/chosen": -2.9090263843536377, + "logits/rejected": -2.803642988204956, + "logps/chosen": -86.6533203125, + "logps/rejected": -1075.635498046875, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1997295320034027, + "rewards/margins": 10.107152938842773, + "rewards/rejected": -10.306882858276367, + "step": 5720 + }, + { + "epoch": 0.34, + "learning_rate": 4.16189247896498e-06, + "logits/chosen": -2.9081945419311523, + "logits/rejected": -2.8132612705230713, + "logps/chosen": -105.81819915771484, + "logps/rejected": -1061.7763671875, + "loss": 0.0248, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34442442655563354, + "rewards/margins": 9.838922500610352, + "rewards/rejected": -10.183345794677734, + "step": 5730 + }, + { + "epoch": 0.34, + "learning_rate": 4.1580013888154126e-06, + "logits/chosen": -2.899080276489258, + "logits/rejected": -2.7419142723083496, + "logps/chosen": -93.13218688964844, + "logps/rejected": -1127.587646484375, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23785266280174255, + "rewards/margins": 10.599491119384766, + "rewards/rejected": -10.837343215942383, + "step": 5740 + }, + { + "epoch": 0.34, + "learning_rate": 4.154103115211523e-06, + "logits/chosen": -2.890242099761963, + "logits/rejected": -2.759457588195801, + "logps/chosen": -108.1460952758789, + "logps/rejected": -1022.3095703125, + "loss": 0.0596, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.4143275320529938, + "rewards/margins": 9.379884719848633, + "rewards/rejected": -9.794212341308594, + "step": 5750 + }, + { + "epoch": 0.34, + "learning_rate": 4.150197675042966e-06, + "logits/chosen": -2.9226062297821045, + "logits/rejected": -2.8194291591644287, + "logps/chosen": -89.87496185302734, + "logps/rejected": -1136.595458984375, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21833351254463196, + "rewards/margins": 10.690470695495605, + "rewards/rejected": -10.908803939819336, + "step": 5760 + }, + { + "epoch": 0.34, + "learning_rate": 4.146285085230447e-06, + "logits/chosen": -2.8766026496887207, + "logits/rejected": -2.7943215370178223, + "logps/chosen": -93.8062973022461, + "logps/rejected": -1165.0740966796875, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20195484161376953, + "rewards/margins": 11.018232345581055, + "rewards/rejected": -11.220187187194824, + "step": 5770 + }, + { + "epoch": 0.34, + "learning_rate": 4.1423653627256445e-06, + "logits/chosen": -2.925827741622925, + "logits/rejected": -2.7985169887542725, + "logps/chosen": -98.32868957519531, + "logps/rejected": -1049.890869140625, + "loss": 0.0547, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3201757073402405, + "rewards/margins": 9.73574447631836, + "rewards/rejected": -10.055920600891113, + "step": 5780 + }, + { + "epoch": 0.35, + "learning_rate": 4.138438524511145e-06, + "logits/chosen": -2.916382074356079, + "logits/rejected": -2.805530071258545, + "logps/chosen": -84.35920715332031, + "logps/rejected": -1072.8790283203125, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15996138751506805, + "rewards/margins": 10.127336502075195, + "rewards/rejected": -10.287299156188965, + "step": 5790 + }, + { + "epoch": 0.35, + "learning_rate": 4.134504587600359e-06, + "logits/chosen": -2.891566276550293, + "logits/rejected": -2.7598204612731934, + "logps/chosen": -85.30905151367188, + "logps/rejected": -1081.630615234375, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13826137781143188, + "rewards/margins": 10.221885681152344, + "rewards/rejected": -10.360146522521973, + "step": 5800 + }, + { + "epoch": 0.35, + "learning_rate": 4.130563569037458e-06, + "logits/chosen": -2.9300613403320312, + "logits/rejected": -2.80314302444458, + "logps/chosen": -81.52914428710938, + "logps/rejected": -1069.8724365234375, + "loss": 0.0251, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04921416938304901, + "rewards/margins": 10.200726509094238, + "rewards/rejected": -10.249940872192383, + "step": 5810 + }, + { + "epoch": 0.35, + "learning_rate": 4.126615485897292e-06, + "logits/chosen": -2.885714054107666, + "logits/rejected": -2.8170981407165527, + "logps/chosen": -81.49369812011719, + "logps/rejected": -1060.9339599609375, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13677053153514862, + "rewards/margins": 10.032251358032227, + "rewards/rejected": -10.169021606445312, + "step": 5820 + }, + { + "epoch": 0.35, + "learning_rate": 4.12266035528532e-06, + "logits/chosen": -2.8987393379211426, + "logits/rejected": -2.8095765113830566, + "logps/chosen": -73.6042709350586, + "logps/rejected": -1113.3331298828125, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019057368859648705, + "rewards/margins": 10.670315742492676, + "rewards/rejected": -10.689372062683105, + "step": 5830 + }, + { + "epoch": 0.35, + "learning_rate": 4.118698194337536e-06, + "logits/chosen": -2.916578531265259, + "logits/rejected": -2.8023016452789307, + "logps/chosen": -72.79944610595703, + "logps/rejected": -992.8082275390625, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029437948018312454, + "rewards/margins": 9.446737289428711, + "rewards/rejected": -9.476176261901855, + "step": 5840 + }, + { + "epoch": 0.35, + "learning_rate": 4.114729020220392e-06, + "logits/chosen": -2.9200024604797363, + "logits/rejected": -2.7916064262390137, + "logps/chosen": -107.86048889160156, + "logps/rejected": -931.02001953125, + "loss": 0.0937, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.3610355257987976, + "rewards/margins": 8.515572547912598, + "rewards/rejected": -8.876606941223145, + "step": 5850 + }, + { + "epoch": 0.35, + "learning_rate": 4.110752850130724e-06, + "logits/chosen": -2.93218994140625, + "logits/rejected": -2.77553129196167, + "logps/chosen": -101.95326232910156, + "logps/rejected": -1144.560302734375, + "loss": 0.0442, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2615772783756256, + "rewards/margins": 10.757929801940918, + "rewards/rejected": -11.019506454467773, + "step": 5860 + }, + { + "epoch": 0.35, + "learning_rate": 4.106769701295683e-06, + "logits/chosen": -2.9479477405548096, + "logits/rejected": -2.83124041557312, + "logps/chosen": -90.23303985595703, + "logps/rejected": -1024.789306640625, + "loss": 0.0268, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21872690320014954, + "rewards/margins": 9.591408729553223, + "rewards/rejected": -9.810136795043945, + "step": 5870 + }, + { + "epoch": 0.35, + "learning_rate": 4.102779590972652e-06, + "logits/chosen": -2.897775173187256, + "logits/rejected": -2.7773444652557373, + "logps/chosen": -97.25646209716797, + "logps/rejected": -1119.082275390625, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21951651573181152, + "rewards/margins": 10.537130355834961, + "rewards/rejected": -10.756647109985352, + "step": 5880 + }, + { + "epoch": 0.35, + "learning_rate": 4.098782536449179e-06, + "logits/chosen": -2.8982994556427, + "logits/rejected": -2.7915964126586914, + "logps/chosen": -121.70416259765625, + "logps/rejected": -912.2566528320312, + "loss": 0.0325, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5237027406692505, + "rewards/margins": 8.154032707214355, + "rewards/rejected": -8.677735328674316, + "step": 5890 + }, + { + "epoch": 0.35, + "learning_rate": 4.094778555042893e-06, + "logits/chosen": -2.890817165374756, + "logits/rejected": -2.7857794761657715, + "logps/chosen": -106.4247055053711, + "logps/rejected": -1102.311279296875, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3361923396587372, + "rewards/margins": 10.237695693969727, + "rewards/rejected": -10.573888778686523, + "step": 5900 + }, + { + "epoch": 0.35, + "learning_rate": 4.090767664101442e-06, + "logits/chosen": -2.8680167198181152, + "logits/rejected": -2.7525858879089355, + "logps/chosen": -126.20454406738281, + "logps/rejected": -1081.9871826171875, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5417619347572327, + "rewards/margins": 9.826483726501465, + "rewards/rejected": -10.368245124816895, + "step": 5910 + }, + { + "epoch": 0.35, + "learning_rate": 4.086749881002403e-06, + "logits/chosen": -2.9129996299743652, + "logits/rejected": -2.765171766281128, + "logps/chosen": -223.8513641357422, + "logps/rejected": -1133.186767578125, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.553107738494873, + "rewards/margins": 9.355761528015137, + "rewards/rejected": -10.908870697021484, + "step": 5920 + }, + { + "epoch": 0.35, + "learning_rate": 4.0827252231532185e-06, + "logits/chosen": -2.9506583213806152, + "logits/rejected": -2.7986555099487305, + "logps/chosen": -230.62399291992188, + "logps/rejected": -1140.5233154296875, + "loss": 0.0248, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6065990924835205, + "rewards/margins": 9.366796493530273, + "rewards/rejected": -10.973396301269531, + "step": 5930 + }, + { + "epoch": 0.35, + "learning_rate": 4.078693707991115e-06, + "logits/chosen": -2.9007019996643066, + "logits/rejected": -2.7728428840637207, + "logps/chosen": -212.84423828125, + "logps/rejected": -1170.5079345703125, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4260632991790771, + "rewards/margins": 9.83821964263916, + "rewards/rejected": -11.2642822265625, + "step": 5940 + }, + { + "epoch": 0.35, + "learning_rate": 4.0746553529830274e-06, + "logits/chosen": -2.94193959236145, + "logits/rejected": -2.7876224517822266, + "logps/chosen": -188.4014434814453, + "logps/rejected": -1101.51708984375, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1470121145248413, + "rewards/margins": 9.426497459411621, + "rewards/rejected": -10.573509216308594, + "step": 5950 + }, + { + "epoch": 0.36, + "learning_rate": 4.070610175625528e-06, + "logits/chosen": -2.944218397140503, + "logits/rejected": -2.8169147968292236, + "logps/chosen": -146.0027313232422, + "logps/rejected": -1183.237548828125, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7262595891952515, + "rewards/margins": 10.65246868133545, + "rewards/rejected": -11.378727912902832, + "step": 5960 + }, + { + "epoch": 0.36, + "learning_rate": 4.066558193444746e-06, + "logits/chosen": -2.890836715698242, + "logits/rejected": -2.8003313541412354, + "logps/chosen": -108.06514739990234, + "logps/rejected": -1042.223388671875, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4303262233734131, + "rewards/margins": 9.557116508483887, + "rewards/rejected": -9.987442970275879, + "step": 5970 + }, + { + "epoch": 0.36, + "learning_rate": 4.0624994239962935e-06, + "logits/chosen": -2.887263298034668, + "logits/rejected": -2.778682231903076, + "logps/chosen": -101.46379089355469, + "logps/rejected": -1073.271728515625, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.193510502576828, + "rewards/margins": 10.095321655273438, + "rewards/rejected": -10.288833618164062, + "step": 5980 + }, + { + "epoch": 0.36, + "learning_rate": 4.058433884865188e-06, + "logits/chosen": -2.9255692958831787, + "logits/rejected": -2.8124046325683594, + "logps/chosen": -88.64639282226562, + "logps/rejected": -1201.575927734375, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1960635483264923, + "rewards/margins": 11.367776870727539, + "rewards/rejected": -11.563840866088867, + "step": 5990 + }, + { + "epoch": 0.36, + "learning_rate": 4.0543615936657785e-06, + "logits/chosen": -2.878767967224121, + "logits/rejected": -2.718470573425293, + "logps/chosen": -99.90760040283203, + "logps/rejected": -1055.6903076171875, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2017739713191986, + "rewards/margins": 9.91982364654541, + "rewards/rejected": -10.121597290039062, + "step": 6000 + }, + { + "epoch": 0.36, + "learning_rate": 4.050282568041668e-06, + "logits/chosen": -2.9185707569122314, + "logits/rejected": -2.7540478706359863, + "logps/chosen": -92.38533782958984, + "logps/rejected": -1091.3779296875, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20589880645275116, + "rewards/margins": 10.27662181854248, + "rewards/rejected": -10.482521057128906, + "step": 6010 + }, + { + "epoch": 0.36, + "learning_rate": 4.046196825665638e-06, + "logits/chosen": -2.887535810470581, + "logits/rejected": -2.793731689453125, + "logps/chosen": -86.35167694091797, + "logps/rejected": -1050.620361328125, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2140572965145111, + "rewards/margins": 9.851717948913574, + "rewards/rejected": -10.065774917602539, + "step": 6020 + }, + { + "epoch": 0.36, + "learning_rate": 4.042104384239568e-06, + "logits/chosen": -2.9212846755981445, + "logits/rejected": -2.7784135341644287, + "logps/chosen": -98.52204895019531, + "logps/rejected": -1123.0556640625, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2850589156150818, + "rewards/margins": 10.500296592712402, + "rewards/rejected": -10.785355567932129, + "step": 6030 + }, + { + "epoch": 0.36, + "learning_rate": 4.038005261494364e-06, + "logits/chosen": -2.941490650177002, + "logits/rejected": -2.8238582611083984, + "logps/chosen": -94.63851928710938, + "logps/rejected": -1045.890869140625, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25241321325302124, + "rewards/margins": 9.768071174621582, + "rewards/rejected": -10.020485877990723, + "step": 6040 + }, + { + "epoch": 0.36, + "learning_rate": 4.033899475189877e-06, + "logits/chosen": -2.9170010089874268, + "logits/rejected": -2.7927098274230957, + "logps/chosen": -84.85941314697266, + "logps/rejected": -1106.8258056640625, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16339339315891266, + "rewards/margins": 10.454751014709473, + "rewards/rejected": -10.618144989013672, + "step": 6050 + }, + { + "epoch": 0.36, + "learning_rate": 4.029787043114835e-06, + "logits/chosen": -2.912188768386841, + "logits/rejected": -2.7875545024871826, + "logps/chosen": -84.82963562011719, + "logps/rejected": -983.8064575195312, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1228787899017334, + "rewards/margins": 9.286172866821289, + "rewards/rejected": -9.409051895141602, + "step": 6060 + }, + { + "epoch": 0.36, + "learning_rate": 4.025667983086753e-06, + "logits/chosen": -2.919534683227539, + "logits/rejected": -2.8064064979553223, + "logps/chosen": -100.1720199584961, + "logps/rejected": -1088.806396484375, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3036212623119354, + "rewards/margins": 10.14475154876709, + "rewards/rejected": -10.448372840881348, + "step": 6070 + }, + { + "epoch": 0.36, + "learning_rate": 4.021542312951862e-06, + "logits/chosen": -2.9426026344299316, + "logits/rejected": -2.8380138874053955, + "logps/chosen": -74.31979370117188, + "logps/rejected": -1067.9876708984375, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.029753098264336586, + "rewards/margins": 10.277753829956055, + "rewards/rejected": -10.248002052307129, + "step": 6080 + }, + { + "epoch": 0.36, + "learning_rate": 4.017410050585038e-06, + "logits/chosen": -2.901204824447632, + "logits/rejected": -2.795081615447998, + "logps/chosen": -72.08843231201172, + "logps/rejected": -1173.5740966796875, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.049670178443193436, + "rewards/margins": 11.246772766113281, + "rewards/rejected": -11.296442031860352, + "step": 6090 + }, + { + "epoch": 0.36, + "learning_rate": 4.013271213889712e-06, + "logits/chosen": -2.9069836139678955, + "logits/rejected": -2.7805657386779785, + "logps/chosen": -75.16117858886719, + "logps/rejected": -1062.703125, + "loss": 0.0278, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.014051372185349464, + "rewards/margins": 10.184760093688965, + "rewards/rejected": -10.198812484741211, + "step": 6100 + }, + { + "epoch": 0.36, + "learning_rate": 4.009125820797802e-06, + "logits/chosen": -2.929468870162964, + "logits/rejected": -2.810861110687256, + "logps/chosen": -80.66822814941406, + "logps/rejected": -1155.5316162109375, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08166153728961945, + "rewards/margins": 11.017230033874512, + "rewards/rejected": -11.098891258239746, + "step": 6110 + }, + { + "epoch": 0.36, + "learning_rate": 4.0049738892696345e-06, + "logits/chosen": -2.8765709400177, + "logits/rejected": -2.7566983699798584, + "logps/chosen": -86.41004180908203, + "logps/rejected": -1207.1488037109375, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1773633509874344, + "rewards/margins": 11.451537132263184, + "rewards/rejected": -11.628900527954102, + "step": 6120 + }, + { + "epoch": 0.37, + "learning_rate": 4.000815437293858e-06, + "logits/chosen": -2.8686881065368652, + "logits/rejected": -2.7436630725860596, + "logps/chosen": -107.51835632324219, + "logps/rejected": -1247.323486328125, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32581138610839844, + "rewards/margins": 11.695051193237305, + "rewards/rejected": -12.020861625671387, + "step": 6130 + }, + { + "epoch": 0.37, + "learning_rate": 3.996650482887377e-06, + "logits/chosen": -2.9623053073883057, + "logits/rejected": -2.809072971343994, + "logps/chosen": -154.9795684814453, + "logps/rejected": -1202.623291015625, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8094560503959656, + "rewards/margins": 10.774508476257324, + "rewards/rejected": -11.583965301513672, + "step": 6140 + }, + { + "epoch": 0.37, + "learning_rate": 3.992479044095267e-06, + "logits/chosen": -2.91528058052063, + "logits/rejected": -2.7518434524536133, + "logps/chosen": -139.6954345703125, + "logps/rejected": -1149.312744140625, + "loss": 0.0259, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.7149208784103394, + "rewards/margins": 10.353741645812988, + "rewards/rejected": -11.068662643432617, + "step": 6150 + }, + { + "epoch": 0.37, + "learning_rate": 3.988301138990697e-06, + "logits/chosen": -2.910979986190796, + "logits/rejected": -2.789787769317627, + "logps/chosen": -173.71800231933594, + "logps/rejected": -1156.4022216796875, + "loss": 0.03, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1050055027008057, + "rewards/margins": 10.007904052734375, + "rewards/rejected": -11.112909317016602, + "step": 6160 + }, + { + "epoch": 0.37, + "learning_rate": 3.984116785674852e-06, + "logits/chosen": -2.9205832481384277, + "logits/rejected": -2.758943557739258, + "logps/chosen": -134.47640991210938, + "logps/rejected": -1116.061767578125, + "loss": 0.0295, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6752219200134277, + "rewards/margins": 10.03821849822998, + "rewards/rejected": -10.713440895080566, + "step": 6170 + }, + { + "epoch": 0.37, + "learning_rate": 3.979926002276856e-06, + "logits/chosen": -2.9152002334594727, + "logits/rejected": -2.8041398525238037, + "logps/chosen": -86.02631378173828, + "logps/rejected": -1053.052001953125, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11347518116235733, + "rewards/margins": 9.975103378295898, + "rewards/rejected": -10.088579177856445, + "step": 6180 + }, + { + "epoch": 0.37, + "learning_rate": 3.97572880695369e-06, + "logits/chosen": -2.932382583618164, + "logits/rejected": -2.7951180934906006, + "logps/chosen": -83.01848602294922, + "logps/rejected": -1027.914794921875, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13890673220157623, + "rewards/margins": 9.708002090454102, + "rewards/rejected": -9.846907615661621, + "step": 6190 + }, + { + "epoch": 0.37, + "learning_rate": 3.971525217890117e-06, + "logits/chosen": -2.891970634460449, + "logits/rejected": -2.774087429046631, + "logps/chosen": -92.13037109375, + "logps/rejected": -1031.8250732421875, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19881696999073029, + "rewards/margins": 9.660085678100586, + "rewards/rejected": -9.858903884887695, + "step": 6200 + }, + { + "epoch": 0.37, + "learning_rate": 3.967315253298599e-06, + "logits/chosen": -2.9005608558654785, + "logits/rejected": -2.8113651275634766, + "logps/chosen": -98.0318603515625, + "logps/rejected": -1191.575439453125, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2886895537376404, + "rewards/margins": 11.196630477905273, + "rewards/rejected": -11.485318183898926, + "step": 6210 + }, + { + "epoch": 0.37, + "learning_rate": 3.963098931419223e-06, + "logits/chosen": -2.942368984222412, + "logits/rejected": -2.766812801361084, + "logps/chosen": -92.01741027832031, + "logps/rejected": -1096.832275390625, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22160692512989044, + "rewards/margins": 10.303544044494629, + "rewards/rejected": -10.525152206420898, + "step": 6220 + }, + { + "epoch": 0.37, + "learning_rate": 3.958876270519619e-06, + "logits/chosen": -2.871826648712158, + "logits/rejected": -2.7864551544189453, + "logps/chosen": -87.02898406982422, + "logps/rejected": -1130.223876953125, + "loss": 0.0586, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.1553284376859665, + "rewards/margins": 10.7144193649292, + "rewards/rejected": -10.869749069213867, + "step": 6230 + }, + { + "epoch": 0.37, + "learning_rate": 3.9546472888948825e-06, + "logits/chosen": -2.9435770511627197, + "logits/rejected": -2.837279796600342, + "logps/chosen": -80.69771575927734, + "logps/rejected": -1076.6014404296875, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11250308901071548, + "rewards/margins": 10.206145286560059, + "rewards/rejected": -10.318647384643555, + "step": 6240 + }, + { + "epoch": 0.37, + "learning_rate": 3.950412004867491e-06, + "logits/chosen": -2.897233486175537, + "logits/rejected": -2.7567076683044434, + "logps/chosen": -81.30437469482422, + "logps/rejected": -1116.92822265625, + "loss": 0.0275, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0483371838927269, + "rewards/margins": 10.688684463500977, + "rewards/rejected": -10.737021446228027, + "step": 6250 + }, + { + "epoch": 0.37, + "learning_rate": 3.94617043678723e-06, + "logits/chosen": -2.8798232078552246, + "logits/rejected": -2.764303684234619, + "logps/chosen": -81.736328125, + "logps/rejected": -973.8621826171875, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11787674576044083, + "rewards/margins": 9.178884506225586, + "rewards/rejected": -9.296760559082031, + "step": 6260 + }, + { + "epoch": 0.37, + "learning_rate": 3.941922603031113e-06, + "logits/chosen": -2.9101200103759766, + "logits/rejected": -2.793123722076416, + "logps/chosen": -83.25614929199219, + "logps/rejected": -1046.255126953125, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1816023290157318, + "rewards/margins": 9.83031940460205, + "rewards/rejected": -10.011922836303711, + "step": 6270 + }, + { + "epoch": 0.37, + "learning_rate": 3.937668522003295e-06, + "logits/chosen": -2.943380832672119, + "logits/rejected": -2.812248945236206, + "logps/chosen": -96.51483154296875, + "logps/rejected": -977.9375, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2522944509983063, + "rewards/margins": 9.086143493652344, + "rewards/rejected": -9.338438034057617, + "step": 6280 + }, + { + "epoch": 0.38, + "learning_rate": 3.933408212135003e-06, + "logits/chosen": -2.9029500484466553, + "logits/rejected": -2.8118367195129395, + "logps/chosen": -81.0311050415039, + "logps/rejected": -978.9517822265625, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13293835520744324, + "rewards/margins": 9.216623306274414, + "rewards/rejected": -9.349563598632812, + "step": 6290 + }, + { + "epoch": 0.38, + "learning_rate": 3.929141691884448e-06, + "logits/chosen": -2.935347080230713, + "logits/rejected": -2.792088031768799, + "logps/chosen": -80.49495697021484, + "logps/rejected": -1181.645751953125, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11081305891275406, + "rewards/margins": 11.2561616897583, + "rewards/rejected": -11.366973876953125, + "step": 6300 + }, + { + "epoch": 0.38, + "learning_rate": 3.9248689797367515e-06, + "logits/chosen": -2.89543080329895, + "logits/rejected": -2.8255739212036133, + "logps/chosen": -93.41390228271484, + "logps/rejected": -1153.826904296875, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2257152497768402, + "rewards/margins": 10.865880966186523, + "rewards/rejected": -11.091597557067871, + "step": 6310 + }, + { + "epoch": 0.38, + "learning_rate": 3.920590094203856e-06, + "logits/chosen": -2.930365800857544, + "logits/rejected": -2.8250584602355957, + "logps/chosen": -82.77169799804688, + "logps/rejected": -1098.818359375, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08564687520265579, + "rewards/margins": 10.462926864624023, + "rewards/rejected": -10.54857349395752, + "step": 6320 + }, + { + "epoch": 0.38, + "learning_rate": 3.916305053824458e-06, + "logits/chosen": -2.929466724395752, + "logits/rejected": -2.8033576011657715, + "logps/chosen": -89.52632141113281, + "logps/rejected": -1175.772216796875, + "loss": 0.0245, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17079557478427887, + "rewards/margins": 11.128901481628418, + "rewards/rejected": -11.299696922302246, + "step": 6330 + }, + { + "epoch": 0.38, + "learning_rate": 3.912013877163916e-06, + "logits/chosen": -2.9159460067749023, + "logits/rejected": -2.781715154647827, + "logps/chosen": -75.20976257324219, + "logps/rejected": -1089.1168212890625, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04497598111629486, + "rewards/margins": 10.396872520446777, + "rewards/rejected": -10.441848754882812, + "step": 6340 + }, + { + "epoch": 0.38, + "learning_rate": 3.907716582814175e-06, + "logits/chosen": -2.913346767425537, + "logits/rejected": -2.8136565685272217, + "logps/chosen": -82.28569030761719, + "logps/rejected": -1120.493896484375, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12956306338310242, + "rewards/margins": 10.637325286865234, + "rewards/rejected": -10.766888618469238, + "step": 6350 + }, + { + "epoch": 0.38, + "learning_rate": 3.903413189393687e-06, + "logits/chosen": -2.8990001678466797, + "logits/rejected": -2.817972421646118, + "logps/chosen": -88.62944030761719, + "logps/rejected": -1043.7587890625, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19952502846717834, + "rewards/margins": 9.794733047485352, + "rewards/rejected": -9.994256973266602, + "step": 6360 + }, + { + "epoch": 0.38, + "learning_rate": 3.899103715547325e-06, + "logits/chosen": -2.9326694011688232, + "logits/rejected": -2.8338265419006348, + "logps/chosen": -74.43017578125, + "logps/rejected": -1132.8187255859375, + "loss": 0.0301, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.034858811646699905, + "rewards/margins": 10.850278854370117, + "rewards/rejected": -10.885136604309082, + "step": 6370 + }, + { + "epoch": 0.38, + "learning_rate": 3.894788179946313e-06, + "logits/chosen": -2.909954786300659, + "logits/rejected": -2.766792058944702, + "logps/chosen": -93.53961181640625, + "logps/rejected": -1024.513916015625, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1904674470424652, + "rewards/margins": 9.611539840698242, + "rewards/rejected": -9.802005767822266, + "step": 6380 + }, + { + "epoch": 0.38, + "learning_rate": 3.890466601288131e-06, + "logits/chosen": -2.9135093688964844, + "logits/rejected": -2.800748586654663, + "logps/chosen": -89.88471984863281, + "logps/rejected": -1001.25927734375, + "loss": 0.0432, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2575301229953766, + "rewards/margins": 9.310220718383789, + "rewards/rejected": -9.56775188446045, + "step": 6390 + }, + { + "epoch": 0.38, + "learning_rate": 3.886138998296446e-06, + "logits/chosen": -2.9123740196228027, + "logits/rejected": -2.7747879028320312, + "logps/chosen": -78.06915283203125, + "logps/rejected": -1117.611328125, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11141111701726913, + "rewards/margins": 10.628135681152344, + "rewards/rejected": -10.739545822143555, + "step": 6400 + }, + { + "epoch": 0.38, + "learning_rate": 3.881805389721021e-06, + "logits/chosen": -2.9010398387908936, + "logits/rejected": -2.7781822681427, + "logps/chosen": -101.46094512939453, + "logps/rejected": -1099.253173828125, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2968367338180542, + "rewards/margins": 10.258384704589844, + "rewards/rejected": -10.555219650268555, + "step": 6410 + }, + { + "epoch": 0.38, + "learning_rate": 3.877465794337648e-06, + "logits/chosen": -2.9146225452423096, + "logits/rejected": -2.8289735317230225, + "logps/chosen": -91.29545593261719, + "logps/rejected": -1072.724853515625, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19682198762893677, + "rewards/margins": 10.085344314575195, + "rewards/rejected": -10.282166481018066, + "step": 6420 + }, + { + "epoch": 0.38, + "learning_rate": 3.873120230948045e-06, + "logits/chosen": -2.9187588691711426, + "logits/rejected": -2.827144145965576, + "logps/chosen": -98.31732177734375, + "logps/rejected": -910.05322265625, + "loss": 0.03, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32930025458335876, + "rewards/margins": 8.338726043701172, + "rewards/rejected": -8.668025970458984, + "step": 6430 + }, + { + "epoch": 0.38, + "learning_rate": 3.868768718379798e-06, + "logits/chosen": -2.924968719482422, + "logits/rejected": -2.7943320274353027, + "logps/chosen": -106.68647766113281, + "logps/rejected": -1153.1915283203125, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38858476281166077, + "rewards/margins": 10.690592765808105, + "rewards/rejected": -11.079178810119629, + "step": 6440 + }, + { + "epoch": 0.38, + "learning_rate": 3.8644112754862614e-06, + "logits/chosen": -2.9585347175598145, + "logits/rejected": -2.8262205123901367, + "logps/chosen": -130.94100952148438, + "logps/rejected": -1087.3109130859375, + "loss": 0.0454, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5822890996932983, + "rewards/margins": 9.82852554321289, + "rewards/rejected": -10.410813331604004, + "step": 6450 + }, + { + "epoch": 0.39, + "learning_rate": 3.860047921146487e-06, + "logits/chosen": -2.9241621494293213, + "logits/rejected": -2.7909157276153564, + "logps/chosen": -87.95986938476562, + "logps/rejected": -1058.934814453125, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1660889834165573, + "rewards/margins": 9.99785327911377, + "rewards/rejected": -10.163942337036133, + "step": 6460 + }, + { + "epoch": 0.39, + "learning_rate": 3.855678674265136e-06, + "logits/chosen": -2.9131813049316406, + "logits/rejected": -2.792825937271118, + "logps/chosen": -91.43817901611328, + "logps/rejected": -1003.8660888671875, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2128903865814209, + "rewards/margins": 9.38998031616211, + "rewards/rejected": -9.602869987487793, + "step": 6470 + }, + { + "epoch": 0.39, + "learning_rate": 3.851303553772402e-06, + "logits/chosen": -2.939784526824951, + "logits/rejected": -2.811361789703369, + "logps/chosen": -82.83306884765625, + "logps/rejected": -1082.6319580078125, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15579429268836975, + "rewards/margins": 10.234354019165039, + "rewards/rejected": -10.390148162841797, + "step": 6480 + }, + { + "epoch": 0.39, + "learning_rate": 3.846922578623924e-06, + "logits/chosen": -2.9310402870178223, + "logits/rejected": -2.8119266033172607, + "logps/chosen": -88.42886352539062, + "logps/rejected": -1169.181884765625, + "loss": 0.0347, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14256305992603302, + "rewards/margins": 11.109556198120117, + "rewards/rejected": -11.252120018005371, + "step": 6490 + }, + { + "epoch": 0.39, + "learning_rate": 3.84253576780071e-06, + "logits/chosen": -2.923973798751831, + "logits/rejected": -2.838120937347412, + "logps/chosen": -85.28412628173828, + "logps/rejected": -1032.6490478515625, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1481623500585556, + "rewards/margins": 9.737385749816895, + "rewards/rejected": -9.88554859161377, + "step": 6500 + }, + { + "epoch": 0.39, + "learning_rate": 3.83814314030905e-06, + "logits/chosen": -2.936239242553711, + "logits/rejected": -2.7988486289978027, + "logps/chosen": -64.92374420166016, + "logps/rejected": -1064.503173828125, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.041876696050167084, + "rewards/margins": 10.23698902130127, + "rewards/rejected": -10.195112228393555, + "step": 6510 + }, + { + "epoch": 0.39, + "learning_rate": 3.833744715180433e-06, + "logits/chosen": -2.9519200325012207, + "logits/rejected": -2.837407350540161, + "logps/chosen": -68.72590637207031, + "logps/rejected": -980.8690185546875, + "loss": 0.0374, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005347815807908773, + "rewards/margins": 9.388530731201172, + "rewards/rejected": -9.383182525634766, + "step": 6520 + }, + { + "epoch": 0.39, + "learning_rate": 3.829340511471471e-06, + "logits/chosen": -2.9176063537597656, + "logits/rejected": -2.8307461738586426, + "logps/chosen": -62.59233856201172, + "logps/rejected": -1033.553955078125, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.101711705327034, + "rewards/margins": 10.011478424072266, + "rewards/rejected": -9.90976619720459, + "step": 6530 + }, + { + "epoch": 0.39, + "learning_rate": 3.824930548263811e-06, + "logits/chosen": -2.939133882522583, + "logits/rejected": -2.832871675491333, + "logps/chosen": -62.86103439331055, + "logps/rejected": -1058.087890625, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.080507293343544, + "rewards/margins": 10.208907127380371, + "rewards/rejected": -10.128398895263672, + "step": 6540 + }, + { + "epoch": 0.39, + "learning_rate": 3.82051484466405e-06, + "logits/chosen": -2.9525794982910156, + "logits/rejected": -2.791292428970337, + "logps/chosen": -67.79877471923828, + "logps/rejected": -1080.8424072265625, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04454438015818596, + "rewards/margins": 10.391596794128418, + "rewards/rejected": -10.347051620483398, + "step": 6550 + }, + { + "epoch": 0.39, + "learning_rate": 3.816093419803663e-06, + "logits/chosen": -2.9264256954193115, + "logits/rejected": -2.790048122406006, + "logps/chosen": -72.99713134765625, + "logps/rejected": -988.3656005859375, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0011682920157909393, + "rewards/margins": 9.443254470825195, + "rewards/rejected": -9.444421768188477, + "step": 6560 + }, + { + "epoch": 0.39, + "learning_rate": 3.811666292838905e-06, + "logits/chosen": -2.914684534072876, + "logits/rejected": -2.783233165740967, + "logps/chosen": -96.10786437988281, + "logps/rejected": -1063.142333984375, + "loss": 0.0226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21540692448616028, + "rewards/margins": 9.973735809326172, + "rewards/rejected": -10.189143180847168, + "step": 6570 + }, + { + "epoch": 0.39, + "learning_rate": 3.8072334829507414e-06, + "logits/chosen": -2.905433177947998, + "logits/rejected": -2.7853519916534424, + "logps/chosen": -136.0424041748047, + "logps/rejected": -1073.2169189453125, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6832762956619263, + "rewards/margins": 9.608771324157715, + "rewards/rejected": -10.292046546936035, + "step": 6580 + }, + { + "epoch": 0.39, + "learning_rate": 3.802795009344757e-06, + "logits/chosen": -2.904794216156006, + "logits/rejected": -2.7682700157165527, + "logps/chosen": -110.67903900146484, + "logps/rejected": -1079.4990234375, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40114468336105347, + "rewards/margins": 9.956185340881348, + "rewards/rejected": -10.357330322265625, + "step": 6590 + }, + { + "epoch": 0.39, + "learning_rate": 3.798350891251076e-06, + "logits/chosen": -2.895101547241211, + "logits/rejected": -2.7678451538085938, + "logps/chosen": -105.27542877197266, + "logps/rejected": -1182.995361328125, + "loss": 0.0355, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37807735800743103, + "rewards/margins": 11.017293930053711, + "rewards/rejected": -11.39537239074707, + "step": 6600 + }, + { + "epoch": 0.39, + "learning_rate": 3.7939011479242784e-06, + "logits/chosen": -2.9222826957702637, + "logits/rejected": -2.766177177429199, + "logps/chosen": -98.7292709350586, + "logps/rejected": -1169.2509765625, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29226866364479065, + "rewards/margins": 10.956949234008789, + "rewards/rejected": -11.249216079711914, + "step": 6610 + }, + { + "epoch": 0.39, + "learning_rate": 3.7894457986433143e-06, + "logits/chosen": -2.8875489234924316, + "logits/rejected": -2.7267744541168213, + "logps/chosen": -113.87646484375, + "logps/rejected": -1042.7359619140625, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46203166246414185, + "rewards/margins": 9.51615047454834, + "rewards/rejected": -9.978182792663574, + "step": 6620 + }, + { + "epoch": 0.4, + "learning_rate": 3.7849848627114248e-06, + "logits/chosen": -2.9139058589935303, + "logits/rejected": -2.7674803733825684, + "logps/chosen": -99.23222351074219, + "logps/rejected": -1149.800537109375, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29400184750556946, + "rewards/margins": 10.766439437866211, + "rewards/rejected": -11.060441970825195, + "step": 6630 + }, + { + "epoch": 0.4, + "learning_rate": 3.7805183594560525e-06, + "logits/chosen": -2.9445641040802, + "logits/rejected": -2.8059093952178955, + "logps/chosen": -124.96044921875, + "logps/rejected": -1100.9664306640625, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5462000966072083, + "rewards/margins": 10.025306701660156, + "rewards/rejected": -10.571507453918457, + "step": 6640 + }, + { + "epoch": 0.4, + "learning_rate": 3.7760463082287647e-06, + "logits/chosen": -2.9038233757019043, + "logits/rejected": -2.7581303119659424, + "logps/chosen": -120.37159729003906, + "logps/rejected": -1200.5157470703125, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5187109708786011, + "rewards/margins": 11.042951583862305, + "rewards/rejected": -11.561662673950195, + "step": 6650 + }, + { + "epoch": 0.4, + "learning_rate": 3.7715687284051618e-06, + "logits/chosen": -2.9060845375061035, + "logits/rejected": -2.783130645751953, + "logps/chosen": -122.11412048339844, + "logps/rejected": -1204.497802734375, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5129600763320923, + "rewards/margins": 11.088891983032227, + "rewards/rejected": -11.601851463317871, + "step": 6660 + }, + { + "epoch": 0.4, + "learning_rate": 3.7670856393848e-06, + "logits/chosen": -2.9159538745880127, + "logits/rejected": -2.760282516479492, + "logps/chosen": -95.04017639160156, + "logps/rejected": -1020.05712890625, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23771195113658905, + "rewards/margins": 9.51791763305664, + "rewards/rejected": -9.755629539489746, + "step": 6670 + }, + { + "epoch": 0.4, + "learning_rate": 3.7625970605911038e-06, + "logits/chosen": -2.887974262237549, + "logits/rejected": -2.751300811767578, + "logps/chosen": -88.84022521972656, + "logps/rejected": -1258.835693359375, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20529845356941223, + "rewards/margins": 11.951395034790039, + "rewards/rejected": -12.156692504882812, + "step": 6680 + }, + { + "epoch": 0.4, + "learning_rate": 3.7581030114712837e-06, + "logits/chosen": -2.9165754318237305, + "logits/rejected": -2.782853364944458, + "logps/chosen": -113.78248596191406, + "logps/rejected": -1018.2673950195312, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41609930992126465, + "rewards/margins": 9.314772605895996, + "rewards/rejected": -9.730871200561523, + "step": 6690 + }, + { + "epoch": 0.4, + "learning_rate": 3.75360351149625e-06, + "logits/chosen": -2.9435460567474365, + "logits/rejected": -2.8024682998657227, + "logps/chosen": -77.84777069091797, + "logps/rejected": -1078.976318359375, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07346780598163605, + "rewards/margins": 10.270526885986328, + "rewards/rejected": -10.343994140625, + "step": 6700 + }, + { + "epoch": 0.4, + "learning_rate": 3.7490985801605303e-06, + "logits/chosen": -2.914401054382324, + "logits/rejected": -2.7792012691497803, + "logps/chosen": -78.41474914550781, + "logps/rejected": -1130.6934814453125, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06996846944093704, + "rewards/margins": 10.809919357299805, + "rewards/rejected": -10.879887580871582, + "step": 6710 + }, + { + "epoch": 0.4, + "learning_rate": 3.744588236982181e-06, + "logits/chosen": -2.926753044128418, + "logits/rejected": -2.788318157196045, + "logps/chosen": -87.95268249511719, + "logps/rejected": -1118.17138671875, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18708589673042297, + "rewards/margins": 10.554030418395996, + "rewards/rejected": -10.741117477416992, + "step": 6720 + }, + { + "epoch": 0.4, + "learning_rate": 3.7400725015027107e-06, + "logits/chosen": -2.903153896331787, + "logits/rejected": -2.8029251098632812, + "logps/chosen": -119.28309631347656, + "logps/rejected": -1237.8499755859375, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4689444899559021, + "rewards/margins": 11.4512300491333, + "rewards/rejected": -11.920174598693848, + "step": 6730 + }, + { + "epoch": 0.4, + "learning_rate": 3.7355513932869862e-06, + "logits/chosen": -2.9454221725463867, + "logits/rejected": -2.820990562438965, + "logps/chosen": -83.53551483154297, + "logps/rejected": -1159.2552490234375, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10593627393245697, + "rewards/margins": 11.049755096435547, + "rewards/rejected": -11.155692100524902, + "step": 6740 + }, + { + "epoch": 0.4, + "learning_rate": 3.7310249319231552e-06, + "logits/chosen": -2.9192848205566406, + "logits/rejected": -2.7833282947540283, + "logps/chosen": -79.68898010253906, + "logps/rejected": -1128.011962890625, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08301255851984024, + "rewards/margins": 10.753610610961914, + "rewards/rejected": -10.836623191833496, + "step": 6750 + }, + { + "epoch": 0.4, + "learning_rate": 3.726493137022557e-06, + "logits/chosen": -2.9085731506347656, + "logits/rejected": -2.812312602996826, + "logps/chosen": -85.72715759277344, + "logps/rejected": -1085.9427490234375, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12284183502197266, + "rewards/margins": 10.29987907409668, + "rewards/rejected": -10.422721862792969, + "step": 6760 + }, + { + "epoch": 0.4, + "learning_rate": 3.7219560282196397e-06, + "logits/chosen": -2.935290813446045, + "logits/rejected": -2.7731118202209473, + "logps/chosen": -119.27406311035156, + "logps/rejected": -1101.92333984375, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4734339714050293, + "rewards/margins": 10.102243423461914, + "rewards/rejected": -10.575677871704102, + "step": 6770 + }, + { + "epoch": 0.4, + "learning_rate": 3.7174136251718735e-06, + "logits/chosen": -2.9045538902282715, + "logits/rejected": -2.7873570919036865, + "logps/chosen": -138.05226135253906, + "logps/rejected": -1124.597412109375, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6557701230049133, + "rewards/margins": 10.145862579345703, + "rewards/rejected": -10.80163288116455, + "step": 6780 + }, + { + "epoch": 0.4, + "learning_rate": 3.712865947559667e-06, + "logits/chosen": -2.9085047245025635, + "logits/rejected": -2.768371820449829, + "logps/chosen": -101.27290344238281, + "logps/rejected": -1119.4437255859375, + "loss": 0.0308, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24350428581237793, + "rewards/margins": 10.501436233520508, + "rewards/rejected": -10.744939804077148, + "step": 6790 + }, + { + "epoch": 0.41, + "learning_rate": 3.7083130150862835e-06, + "logits/chosen": -2.9560627937316895, + "logits/rejected": -2.828529119491577, + "logps/chosen": -76.32826232910156, + "logps/rejected": -1146.9981689453125, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03673182427883148, + "rewards/margins": 10.98664379119873, + "rewards/rejected": -11.02337646484375, + "step": 6800 + }, + { + "epoch": 0.41, + "learning_rate": 3.7037548474777484e-06, + "logits/chosen": -2.9539592266082764, + "logits/rejected": -2.823580265045166, + "logps/chosen": -78.04996490478516, + "logps/rejected": -1050.046875, + "loss": 0.0306, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025190208107233047, + "rewards/margins": 10.027109146118164, + "rewards/rejected": -10.052297592163086, + "step": 6810 + }, + { + "epoch": 0.41, + "learning_rate": 3.6991914644827732e-06, + "logits/chosen": -2.93625807762146, + "logits/rejected": -2.7904951572418213, + "logps/chosen": -73.10401916503906, + "logps/rejected": -1024.831298828125, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03253782540559769, + "rewards/margins": 9.767110824584961, + "rewards/rejected": -9.79964828491211, + "step": 6820 + }, + { + "epoch": 0.41, + "learning_rate": 3.6946228858726642e-06, + "logits/chosen": -2.9074456691741943, + "logits/rejected": -2.7821357250213623, + "logps/chosen": -101.6294174194336, + "logps/rejected": -1086.5552978515625, + "loss": 0.0269, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32821211218833923, + "rewards/margins": 10.076467514038086, + "rewards/rejected": -10.404680252075195, + "step": 6830 + }, + { + "epoch": 0.41, + "learning_rate": 3.690049131441238e-06, + "logits/chosen": -2.9105629920959473, + "logits/rejected": -2.8157317638397217, + "logps/chosen": -96.59138488769531, + "logps/rejected": -1192.170654296875, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2676796317100525, + "rewards/margins": 11.207104682922363, + "rewards/rejected": -11.474782943725586, + "step": 6840 + }, + { + "epoch": 0.41, + "learning_rate": 3.6854702210047353e-06, + "logits/chosen": -2.9104385375976562, + "logits/rejected": -2.787836790084839, + "logps/chosen": -94.45555877685547, + "logps/rejected": -956.9180908203125, + "loss": 0.0248, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24695566296577454, + "rewards/margins": 8.900993347167969, + "rewards/rejected": -9.14794921875, + "step": 6850 + }, + { + "epoch": 0.41, + "learning_rate": 3.6808861744017386e-06, + "logits/chosen": -2.904888391494751, + "logits/rejected": -2.7817986011505127, + "logps/chosen": -86.03636169433594, + "logps/rejected": -1182.254638671875, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14749349653720856, + "rewards/margins": 11.233137130737305, + "rewards/rejected": -11.380630493164062, + "step": 6860 + }, + { + "epoch": 0.41, + "learning_rate": 3.6762970114930796e-06, + "logits/chosen": -2.9094462394714355, + "logits/rejected": -2.8107056617736816, + "logps/chosen": -103.11491394042969, + "logps/rejected": -1092.7376708984375, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32972320914268494, + "rewards/margins": 10.160021781921387, + "rewards/rejected": -10.489745140075684, + "step": 6870 + }, + { + "epoch": 0.41, + "learning_rate": 3.6717027521617593e-06, + "logits/chosen": -2.9403204917907715, + "logits/rejected": -2.775780200958252, + "logps/chosen": -113.1695785522461, + "logps/rejected": -1159.4503173828125, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3750307261943817, + "rewards/margins": 10.771791458129883, + "rewards/rejected": -11.146821975708008, + "step": 6880 + }, + { + "epoch": 0.41, + "learning_rate": 3.6671034163128594e-06, + "logits/chosen": -2.9391627311706543, + "logits/rejected": -2.794499635696411, + "logps/chosen": -90.55206298828125, + "logps/rejected": -1018.2598876953125, + "loss": 0.1077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21661922335624695, + "rewards/margins": 9.516180992126465, + "rewards/rejected": -9.732800483703613, + "step": 6890 + }, + { + "epoch": 0.41, + "learning_rate": 3.662499023873454e-06, + "logits/chosen": -2.8947079181671143, + "logits/rejected": -2.8041396141052246, + "logps/chosen": -95.18833923339844, + "logps/rejected": -1077.7174072265625, + "loss": 0.0326, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2365306168794632, + "rewards/margins": 10.094181060791016, + "rewards/rejected": -10.330713272094727, + "step": 6900 + }, + { + "epoch": 0.41, + "learning_rate": 3.657889594792528e-06, + "logits/chosen": -2.9660799503326416, + "logits/rejected": -2.8214352130889893, + "logps/chosen": -86.84017181396484, + "logps/rejected": -900.7272338867188, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12992525100708008, + "rewards/margins": 8.428224563598633, + "rewards/rejected": -8.558150291442871, + "step": 6910 + }, + { + "epoch": 0.41, + "learning_rate": 3.653275149040887e-06, + "logits/chosen": -2.9090256690979004, + "logits/rejected": -2.809492588043213, + "logps/chosen": -99.82324981689453, + "logps/rejected": -949.4283447265625, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24338094890117645, + "rewards/margins": 8.810364723205566, + "rewards/rejected": -9.05374526977539, + "step": 6920 + }, + { + "epoch": 0.41, + "learning_rate": 3.6486557066110694e-06, + "logits/chosen": -2.9437060356140137, + "logits/rejected": -2.8253989219665527, + "logps/chosen": -78.02464294433594, + "logps/rejected": -1014.4766845703125, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07109050452709198, + "rewards/margins": 9.623019218444824, + "rewards/rejected": -9.694109916687012, + "step": 6930 + }, + { + "epoch": 0.41, + "learning_rate": 3.644031287517267e-06, + "logits/chosen": -2.915351152420044, + "logits/rejected": -2.765439748764038, + "logps/chosen": -106.92597961425781, + "logps/rejected": -1165.0740966796875, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3702332675457001, + "rewards/margins": 10.828544616699219, + "rewards/rejected": -11.198777198791504, + "step": 6940 + }, + { + "epoch": 0.41, + "learning_rate": 3.639401911795232e-06, + "logits/chosen": -2.9155404567718506, + "logits/rejected": -2.7839725017547607, + "logps/chosen": -99.48594665527344, + "logps/rejected": -1071.2060546875, + "loss": 0.0285, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2706945538520813, + "rewards/margins": 9.996676445007324, + "rewards/rejected": -10.267372131347656, + "step": 6950 + }, + { + "epoch": 0.42, + "learning_rate": 3.6347675995021874e-06, + "logits/chosen": -2.9346346855163574, + "logits/rejected": -2.7792932987213135, + "logps/chosen": -100.21730041503906, + "logps/rejected": -1037.0322265625, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20675675570964813, + "rewards/margins": 9.714632987976074, + "rewards/rejected": -9.92138957977295, + "step": 6960 + }, + { + "epoch": 0.42, + "learning_rate": 3.6301283707167495e-06, + "logits/chosen": -2.933328151702881, + "logits/rejected": -2.767930269241333, + "logps/chosen": -101.89979553222656, + "logps/rejected": -1134.9237060546875, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22748295962810516, + "rewards/margins": 10.678056716918945, + "rewards/rejected": -10.905540466308594, + "step": 6970 + }, + { + "epoch": 0.42, + "learning_rate": 3.6254842455388347e-06, + "logits/chosen": -2.892735004425049, + "logits/rejected": -2.7738585472106934, + "logps/chosen": -103.06649017333984, + "logps/rejected": -1066.89453125, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3658735156059265, + "rewards/margins": 9.865544319152832, + "rewards/rejected": -10.23141860961914, + "step": 6980 + }, + { + "epoch": 0.42, + "learning_rate": 3.6208352440895704e-06, + "logits/chosen": -2.9056954383850098, + "logits/rejected": -2.7622809410095215, + "logps/chosen": -122.41854095458984, + "logps/rejected": -997.4375, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4713757634162903, + "rewards/margins": 9.070648193359375, + "rewards/rejected": -9.542022705078125, + "step": 6990 + }, + { + "epoch": 0.42, + "learning_rate": 3.6161813865112155e-06, + "logits/chosen": -2.904318332672119, + "logits/rejected": -2.7899816036224365, + "logps/chosen": -101.73571014404297, + "logps/rejected": -986.7930908203125, + "loss": 0.023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2600402235984802, + "rewards/margins": 9.165243148803711, + "rewards/rejected": -9.425283432006836, + "step": 7000 + }, + { + "epoch": 0.42, + "eval_logits/chosen": -2.8776113986968994, + "eval_logits/rejected": -2.844184160232544, + "eval_logps/chosen": -175.428955078125, + "eval_logps/rejected": -904.81982421875, + "eval_loss": 0.01644059643149376, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": -1.0835449695587158, + "eval_rewards/margins": 7.5091705322265625, + "eval_rewards/rejected": -8.592716217041016, + "eval_runtime": 4.2684, + "eval_samples_per_second": 1.171, + "eval_steps_per_second": 0.234, + "step": 7000 + }, + { + "epoch": 0.42, + "learning_rate": 3.611522692967065e-06, + "logits/chosen": -2.9096760749816895, + "logits/rejected": -2.750338077545166, + "logps/chosen": -110.2779312133789, + "logps/rejected": -1105.7159423828125, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3872075378894806, + "rewards/margins": 10.244175910949707, + "rewards/rejected": -10.631383895874023, + "step": 7010 + }, + { + "epoch": 0.42, + "learning_rate": 3.6068591836413687e-06, + "logits/chosen": -2.9231553077697754, + "logits/rejected": -2.7905192375183105, + "logps/chosen": -92.08689880371094, + "logps/rejected": -1078.547119140625, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2296150177717209, + "rewards/margins": 10.1162691116333, + "rewards/rejected": -10.3458833694458, + "step": 7020 + }, + { + "epoch": 0.42, + "learning_rate": 3.602190878739239e-06, + "logits/chosen": -2.900203227996826, + "logits/rejected": -2.785003662109375, + "logps/chosen": -101.54088592529297, + "logps/rejected": -1139.35986328125, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3198123276233673, + "rewards/margins": 10.610698699951172, + "rewards/rejected": -10.930512428283691, + "step": 7030 + }, + { + "epoch": 0.42, + "learning_rate": 3.5975177984865673e-06, + "logits/chosen": -2.878770589828491, + "logits/rejected": -2.751427173614502, + "logps/chosen": -99.40553283691406, + "logps/rejected": -1185.62158203125, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32205453515052795, + "rewards/margins": 11.092864990234375, + "rewards/rejected": -11.41491985321045, + "step": 7040 + }, + { + "epoch": 0.42, + "learning_rate": 3.592839963129934e-06, + "logits/chosen": -2.8878285884857178, + "logits/rejected": -2.7685837745666504, + "logps/chosen": -78.28253173828125, + "logps/rejected": -1149.0386962890625, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09539985656738281, + "rewards/margins": 10.969889640808105, + "rewards/rejected": -11.065289497375488, + "step": 7050 + }, + { + "epoch": 0.42, + "learning_rate": 3.588157392936521e-06, + "logits/chosen": -2.882582187652588, + "logits/rejected": -2.7486889362335205, + "logps/chosen": -83.16156005859375, + "logps/rejected": -1183.084228515625, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12763121724128723, + "rewards/margins": 11.256253242492676, + "rewards/rejected": -11.383883476257324, + "step": 7060 + }, + { + "epoch": 0.42, + "learning_rate": 3.583470108194026e-06, + "logits/chosen": -2.906589984893799, + "logits/rejected": -2.790468692779541, + "logps/chosen": -89.14583587646484, + "logps/rejected": -1030.7427978515625, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22094586491584778, + "rewards/margins": 9.643675804138184, + "rewards/rejected": -9.864623069763184, + "step": 7070 + }, + { + "epoch": 0.42, + "learning_rate": 3.5787781292105704e-06, + "logits/chosen": -2.920591354370117, + "logits/rejected": -2.8202481269836426, + "logps/chosen": -116.5506362915039, + "logps/rejected": -1035.1424560546875, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49446067214012146, + "rewards/margins": 9.41383171081543, + "rewards/rejected": -9.908292770385742, + "step": 7080 + }, + { + "epoch": 0.42, + "learning_rate": 3.5740814763146164e-06, + "logits/chosen": -2.874154567718506, + "logits/rejected": -2.7651419639587402, + "logps/chosen": -117.6974868774414, + "logps/rejected": -1153.4556884765625, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5130449533462524, + "rewards/margins": 10.582649230957031, + "rewards/rejected": -11.095693588256836, + "step": 7090 + }, + { + "epoch": 0.42, + "learning_rate": 3.569380169854875e-06, + "logits/chosen": -2.9505703449249268, + "logits/rejected": -2.8482229709625244, + "logps/chosen": -75.36229705810547, + "logps/rejected": -1178.416259765625, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09957019239664078, + "rewards/margins": 11.241961479187012, + "rewards/rejected": -11.341530799865723, + "step": 7100 + }, + { + "epoch": 0.42, + "learning_rate": 3.5646742302002185e-06, + "logits/chosen": -2.8947432041168213, + "logits/rejected": -2.8020567893981934, + "logps/chosen": -77.16735076904297, + "logps/rejected": -1093.35693359375, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07255446910858154, + "rewards/margins": 10.425664901733398, + "rewards/rejected": -10.498220443725586, + "step": 7110 + }, + { + "epoch": 0.42, + "learning_rate": 3.5599636777395954e-06, + "logits/chosen": -2.896845579147339, + "logits/rejected": -2.781343936920166, + "logps/chosen": -94.4598617553711, + "logps/rejected": -1132.225341796875, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19503115117549896, + "rewards/margins": 10.689114570617676, + "rewards/rejected": -10.884145736694336, + "step": 7120 + }, + { + "epoch": 0.43, + "learning_rate": 3.555248532881938e-06, + "logits/chosen": -2.8788840770721436, + "logits/rejected": -2.7696962356567383, + "logps/chosen": -147.68276977539062, + "logps/rejected": -1132.974853515625, + "loss": 0.0219, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7828652262687683, + "rewards/margins": 10.097051620483398, + "rewards/rejected": -10.879919052124023, + "step": 7130 + }, + { + "epoch": 0.43, + "learning_rate": 3.5505288160560745e-06, + "logits/chosen": -2.9414455890655518, + "logits/rejected": -2.820356845855713, + "logps/chosen": -119.46009826660156, + "logps/rejected": -995.22802734375, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47757524251937866, + "rewards/margins": 9.035507202148438, + "rewards/rejected": -9.513082504272461, + "step": 7140 + }, + { + "epoch": 0.43, + "learning_rate": 3.545804547710645e-06, + "logits/chosen": -2.9364776611328125, + "logits/rejected": -2.788905620574951, + "logps/chosen": -121.44282531738281, + "logps/rejected": -1161.408935546875, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5759637951850891, + "rewards/margins": 10.596400260925293, + "rewards/rejected": -11.17236328125, + "step": 7150 + }, + { + "epoch": 0.43, + "learning_rate": 3.5410757483140057e-06, + "logits/chosen": -2.9435677528381348, + "logits/rejected": -2.796893835067749, + "logps/chosen": -120.52152252197266, + "logps/rejected": -1144.41796875, + "loss": 0.0208, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5478697419166565, + "rewards/margins": 10.453125, + "rewards/rejected": -11.000993728637695, + "step": 7160 + }, + { + "epoch": 0.43, + "learning_rate": 3.5363424383541465e-06, + "logits/chosen": -2.9058589935302734, + "logits/rejected": -2.7687151432037354, + "logps/chosen": -200.76461791992188, + "logps/rejected": -1074.436279296875, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3432514667510986, + "rewards/margins": 8.966203689575195, + "rewards/rejected": -10.309454917907715, + "step": 7170 + }, + { + "epoch": 0.43, + "learning_rate": 3.5316046383385983e-06, + "logits/chosen": -2.912508487701416, + "logits/rejected": -2.7569141387939453, + "logps/chosen": -184.05824279785156, + "logps/rejected": -1251.82958984375, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1382877826690674, + "rewards/margins": 10.932048797607422, + "rewards/rejected": -12.07033634185791, + "step": 7180 + }, + { + "epoch": 0.43, + "learning_rate": 3.526862368794347e-06, + "logits/chosen": -2.9150776863098145, + "logits/rejected": -2.752119541168213, + "logps/chosen": -182.68524169921875, + "logps/rejected": -1092.622802734375, + "loss": 0.0303, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1595345735549927, + "rewards/margins": 9.330678939819336, + "rewards/rejected": -10.490215301513672, + "step": 7190 + }, + { + "epoch": 0.43, + "learning_rate": 3.522115650267743e-06, + "logits/chosen": -2.9281692504882812, + "logits/rejected": -2.759819746017456, + "logps/chosen": -209.9885711669922, + "logps/rejected": -1307.581298828125, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3615130186080933, + "rewards/margins": 11.256918907165527, + "rewards/rejected": -12.618432998657227, + "step": 7200 + }, + { + "epoch": 0.43, + "learning_rate": 3.5173645033244103e-06, + "logits/chosen": -2.885080099105835, + "logits/rejected": -2.7192654609680176, + "logps/chosen": -159.7135009765625, + "logps/rejected": -1246.55517578125, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8966726064682007, + "rewards/margins": 11.121199607849121, + "rewards/rejected": -12.017870903015137, + "step": 7210 + }, + { + "epoch": 0.43, + "learning_rate": 3.5126089485491627e-06, + "logits/chosen": -2.915461540222168, + "logits/rejected": -2.760791540145874, + "logps/chosen": -126.8998031616211, + "logps/rejected": -1186.1251220703125, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6100717782974243, + "rewards/margins": 10.823419570922852, + "rewards/rejected": -11.433491706848145, + "step": 7220 + }, + { + "epoch": 0.43, + "learning_rate": 3.5078490065459083e-06, + "logits/chosen": -2.8814327716827393, + "logits/rejected": -2.754974365234375, + "logps/chosen": -118.06925964355469, + "logps/rejected": -1088.775390625, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49047431349754333, + "rewards/margins": 9.959485054016113, + "rewards/rejected": -10.449958801269531, + "step": 7230 + }, + { + "epoch": 0.43, + "learning_rate": 3.503084697937565e-06, + "logits/chosen": -2.9181699752807617, + "logits/rejected": -2.7804200649261475, + "logps/chosen": -135.76287841796875, + "logps/rejected": -1097.1099853515625, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.638733446598053, + "rewards/margins": 9.89672565460205, + "rewards/rejected": -10.5354585647583, + "step": 7240 + }, + { + "epoch": 0.43, + "learning_rate": 3.4983160433659702e-06, + "logits/chosen": -2.9229915142059326, + "logits/rejected": -2.7635908126831055, + "logps/chosen": -177.2243194580078, + "logps/rejected": -1138.9625244140625, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0620439052581787, + "rewards/margins": 9.895639419555664, + "rewards/rejected": -10.957681655883789, + "step": 7250 + }, + { + "epoch": 0.43, + "learning_rate": 3.493543063491788e-06, + "logits/chosen": -2.9145591259002686, + "logits/rejected": -2.793954372406006, + "logps/chosen": -214.75399780273438, + "logps/rejected": -1211.7982177734375, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3817886114120483, + "rewards/margins": 10.29120922088623, + "rewards/rejected": -11.67299747467041, + "step": 7260 + }, + { + "epoch": 0.43, + "learning_rate": 3.4887657789944236e-06, + "logits/chosen": -2.890446901321411, + "logits/rejected": -2.7624447345733643, + "logps/chosen": -140.09176635742188, + "logps/rejected": -1124.4049072265625, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6334695219993591, + "rewards/margins": 10.17092227935791, + "rewards/rejected": -10.804390907287598, + "step": 7270 + }, + { + "epoch": 0.43, + "learning_rate": 3.4839842105719346e-06, + "logits/chosen": -2.8583121299743652, + "logits/rejected": -2.751098394393921, + "logps/chosen": -113.97159576416016, + "logps/rejected": -1133.4715576171875, + "loss": 0.0232, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.485659122467041, + "rewards/margins": 10.407791137695312, + "rewards/rejected": -10.893449783325195, + "step": 7280 + }, + { + "epoch": 0.43, + "learning_rate": 3.4791983789409358e-06, + "logits/chosen": -2.9343976974487305, + "logits/rejected": -2.760997772216797, + "logps/chosen": -107.3431396484375, + "logps/rejected": -1103.991455078125, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37820738554000854, + "rewards/margins": 10.233498573303223, + "rewards/rejected": -10.611705780029297, + "step": 7290 + }, + { + "epoch": 0.44, + "learning_rate": 3.474408304836514e-06, + "logits/chosen": -2.9446558952331543, + "logits/rejected": -2.805596113204956, + "logps/chosen": -117.64411926269531, + "logps/rejected": -1213.383544921875, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5462199449539185, + "rewards/margins": 11.134984970092773, + "rewards/rejected": -11.681203842163086, + "step": 7300 + }, + { + "epoch": 0.44, + "learning_rate": 3.4696140090121377e-06, + "logits/chosen": -2.836948871612549, + "logits/rejected": -2.7465267181396484, + "logps/chosen": -149.80337524414062, + "logps/rejected": -1234.909912109375, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.799278199672699, + "rewards/margins": 11.106013298034668, + "rewards/rejected": -11.905291557312012, + "step": 7310 + }, + { + "epoch": 0.44, + "learning_rate": 3.4648155122395653e-06, + "logits/chosen": -2.9179329872131348, + "logits/rejected": -2.801334857940674, + "logps/chosen": -131.99435424804688, + "logps/rejected": -1187.2066650390625, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6442035436630249, + "rewards/margins": 10.789766311645508, + "rewards/rejected": -11.433968544006348, + "step": 7320 + }, + { + "epoch": 0.44, + "learning_rate": 3.460012835308757e-06, + "logits/chosen": -2.9187004566192627, + "logits/rejected": -2.7918803691864014, + "logps/chosen": -121.123046875, + "logps/rejected": -1253.275634765625, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48308229446411133, + "rewards/margins": 11.605780601501465, + "rewards/rejected": -12.08886432647705, + "step": 7330 + }, + { + "epoch": 0.44, + "learning_rate": 3.455205999027783e-06, + "logits/chosen": -2.89402437210083, + "logits/rejected": -2.7125091552734375, + "logps/chosen": -166.7138214111328, + "logps/rejected": -1202.82568359375, + "loss": 0.0269, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.9210103750228882, + "rewards/margins": 10.652724266052246, + "rewards/rejected": -11.573735237121582, + "step": 7340 + }, + { + "epoch": 0.44, + "learning_rate": 3.4503950242227356e-06, + "logits/chosen": -2.9318723678588867, + "logits/rejected": -2.7714152336120605, + "logps/chosen": -128.21913146972656, + "logps/rejected": -1293.4091796875, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6019225120544434, + "rewards/margins": 11.885282516479492, + "rewards/rejected": -12.487207412719727, + "step": 7350 + }, + { + "epoch": 0.44, + "learning_rate": 3.445579931737637e-06, + "logits/chosen": -2.8801767826080322, + "logits/rejected": -2.7383370399475098, + "logps/chosen": -130.3653106689453, + "logps/rejected": -1269.890869140625, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6593203544616699, + "rewards/margins": 11.590472221374512, + "rewards/rejected": -12.249794006347656, + "step": 7360 + }, + { + "epoch": 0.44, + "learning_rate": 3.44076074243435e-06, + "logits/chosen": -2.911398410797119, + "logits/rejected": -2.793224334716797, + "logps/chosen": -125.44395446777344, + "logps/rejected": -1229.047607421875, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5073223114013672, + "rewards/margins": 11.336065292358398, + "rewards/rejected": -11.84338665008545, + "step": 7370 + }, + { + "epoch": 0.44, + "learning_rate": 3.435937477192486e-06, + "logits/chosen": -2.8998169898986816, + "logits/rejected": -2.7710812091827393, + "logps/chosen": -100.43770599365234, + "logps/rejected": -1117.4185791015625, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33409082889556885, + "rewards/margins": 10.401369094848633, + "rewards/rejected": -10.735459327697754, + "step": 7380 + }, + { + "epoch": 0.44, + "learning_rate": 3.431110156909316e-06, + "logits/chosen": -2.914207696914673, + "logits/rejected": -2.7645771503448486, + "logps/chosen": -142.84832763671875, + "logps/rejected": -1104.692138671875, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7382567524909973, + "rewards/margins": 9.862664222717285, + "rewards/rejected": -10.600919723510742, + "step": 7390 + }, + { + "epoch": 0.44, + "learning_rate": 3.4262788024996835e-06, + "logits/chosen": -2.927865505218506, + "logits/rejected": -2.7974324226379395, + "logps/chosen": -92.3178482055664, + "logps/rejected": -1177.158447265625, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20079433917999268, + "rewards/margins": 11.131086349487305, + "rewards/rejected": -11.331879615783691, + "step": 7400 + }, + { + "epoch": 0.44, + "learning_rate": 3.421443434895905e-06, + "logits/chosen": -2.906113386154175, + "logits/rejected": -2.8002967834472656, + "logps/chosen": -92.94039916992188, + "logps/rejected": -1144.2978515625, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19046145677566528, + "rewards/margins": 10.805360794067383, + "rewards/rejected": -10.995823860168457, + "step": 7410 + }, + { + "epoch": 0.44, + "learning_rate": 3.4166040750476868e-06, + "logits/chosen": -2.8886752128601074, + "logits/rejected": -2.7687957286834717, + "logps/chosen": -95.22380065917969, + "logps/rejected": -1039.466796875, + "loss": 0.0211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2493990659713745, + "rewards/margins": 9.703400611877441, + "rewards/rejected": -9.952799797058105, + "step": 7420 + }, + { + "epoch": 0.44, + "learning_rate": 3.4117607439220336e-06, + "logits/chosen": -2.9080374240875244, + "logits/rejected": -2.810459613800049, + "logps/chosen": -81.8446044921875, + "logps/rejected": -1124.9327392578125, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12409061193466187, + "rewards/margins": 10.675195693969727, + "rewards/rejected": -10.799286842346191, + "step": 7430 + }, + { + "epoch": 0.44, + "learning_rate": 3.406913462503153e-06, + "logits/chosen": -2.9194133281707764, + "logits/rejected": -2.79185152053833, + "logps/chosen": -85.29582214355469, + "logps/rejected": -1234.4886474609375, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11951877921819687, + "rewards/margins": 11.769564628601074, + "rewards/rejected": -11.889083862304688, + "step": 7440 + }, + { + "epoch": 0.44, + "learning_rate": 3.40206225179237e-06, + "logits/chosen": -2.8995308876037598, + "logits/rejected": -2.7470388412475586, + "logps/chosen": -84.77726745605469, + "logps/rejected": -1044.743896484375, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17494158446788788, + "rewards/margins": 9.827191352844238, + "rewards/rejected": -10.002132415771484, + "step": 7450 + }, + { + "epoch": 0.44, + "learning_rate": 3.397207132808033e-06, + "logits/chosen": -2.8914952278137207, + "logits/rejected": -2.7760746479034424, + "logps/chosen": -117.90235900878906, + "logps/rejected": -1162.0018310546875, + "loss": 0.0582, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46003374457359314, + "rewards/margins": 10.728364944458008, + "rewards/rejected": -11.188400268554688, + "step": 7460 + }, + { + "epoch": 0.45, + "learning_rate": 3.3923481265854226e-06, + "logits/chosen": -2.924891948699951, + "logits/rejected": -2.7873799800872803, + "logps/chosen": -138.79653930664062, + "logps/rejected": -1196.8026123046875, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6437373161315918, + "rewards/margins": 10.888925552368164, + "rewards/rejected": -11.532663345336914, + "step": 7470 + }, + { + "epoch": 0.45, + "learning_rate": 3.387485254176663e-06, + "logits/chosen": -2.909339427947998, + "logits/rejected": -2.757549524307251, + "logps/chosen": -136.43145751953125, + "logps/rejected": -1189.8463134765625, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6500800251960754, + "rewards/margins": 10.8010835647583, + "rewards/rejected": -11.451164245605469, + "step": 7480 + }, + { + "epoch": 0.45, + "learning_rate": 3.382618536650626e-06, + "logits/chosen": -2.9054527282714844, + "logits/rejected": -2.8070077896118164, + "logps/chosen": -149.27891540527344, + "logps/rejected": -1211.5892333984375, + "loss": 0.0219, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7684259414672852, + "rewards/margins": 10.909029006958008, + "rewards/rejected": -11.677453994750977, + "step": 7490 + }, + { + "epoch": 0.45, + "learning_rate": 3.377747995092846e-06, + "logits/chosen": -2.908186435699463, + "logits/rejected": -2.747255802154541, + "logps/chosen": -91.23493957519531, + "logps/rejected": -1106.392333984375, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24066027998924255, + "rewards/margins": 10.382299423217773, + "rewards/rejected": -10.622961044311523, + "step": 7500 + }, + { + "epoch": 0.45, + "learning_rate": 3.3728736506054234e-06, + "logits/chosen": -2.862262010574341, + "logits/rejected": -2.7661473751068115, + "logps/chosen": -92.10249328613281, + "logps/rejected": -1219.478271484375, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.196889728307724, + "rewards/margins": 11.551292419433594, + "rewards/rejected": -11.74818229675293, + "step": 7510 + }, + { + "epoch": 0.45, + "learning_rate": 3.3679955243069364e-06, + "logits/chosen": -2.9217212200164795, + "logits/rejected": -2.797945022583008, + "logps/chosen": -90.7013931274414, + "logps/rejected": -1055.8570556640625, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25004202127456665, + "rewards/margins": 9.877058029174805, + "rewards/rejected": -10.127099990844727, + "step": 7520 + }, + { + "epoch": 0.45, + "learning_rate": 3.3631136373323468e-06, + "logits/chosen": -2.908334732055664, + "logits/rejected": -2.785301685333252, + "logps/chosen": -108.50828552246094, + "logps/rejected": -1166.8721923828125, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35020893812179565, + "rewards/margins": 10.8536958694458, + "rewards/rejected": -11.20390510559082, + "step": 7530 + }, + { + "epoch": 0.45, + "learning_rate": 3.3582280108329125e-06, + "logits/chosen": -2.9232125282287598, + "logits/rejected": -2.776710033416748, + "logps/chosen": -103.93333435058594, + "logps/rejected": -1093.431396484375, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.315208375453949, + "rewards/margins": 10.189821243286133, + "rewards/rejected": -10.505029678344727, + "step": 7540 + }, + { + "epoch": 0.45, + "learning_rate": 3.353338665976089e-06, + "logits/chosen": -2.903386354446411, + "logits/rejected": -2.7691469192504883, + "logps/chosen": -126.32470703125, + "logps/rejected": -1136.219970703125, + "loss": 0.0338, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5213348865509033, + "rewards/margins": 10.39504337310791, + "rewards/rejected": -10.916378021240234, + "step": 7550 + }, + { + "epoch": 0.45, + "learning_rate": 3.3484456239454467e-06, + "logits/chosen": -2.8897786140441895, + "logits/rejected": -2.7951836585998535, + "logps/chosen": -190.11044311523438, + "logps/rejected": -1097.8270263671875, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2235487699508667, + "rewards/margins": 9.306633949279785, + "rewards/rejected": -10.530183792114258, + "step": 7560 + }, + { + "epoch": 0.45, + "learning_rate": 3.3435489059405713e-06, + "logits/chosen": -2.9078426361083984, + "logits/rejected": -2.799065113067627, + "logps/chosen": -112.38739013671875, + "logps/rejected": -1027.890869140625, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4134586453437805, + "rewards/margins": 9.435567855834961, + "rewards/rejected": -9.849026679992676, + "step": 7570 + }, + { + "epoch": 0.45, + "learning_rate": 3.3386485331769747e-06, + "logits/chosen": -2.8979456424713135, + "logits/rejected": -2.775184154510498, + "logps/chosen": -78.13394927978516, + "logps/rejected": -1113.0523681640625, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.061250101774930954, + "rewards/margins": 10.617616653442383, + "rewards/rejected": -10.678865432739258, + "step": 7580 + }, + { + "epoch": 0.45, + "learning_rate": 3.3337445268860065e-06, + "logits/chosen": -2.9517388343811035, + "logits/rejected": -2.8235983848571777, + "logps/chosen": -77.43302154541016, + "logps/rejected": -1103.814208984375, + "loss": 0.0946, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04222659394145012, + "rewards/margins": 10.552355766296387, + "rewards/rejected": -10.594582557678223, + "step": 7590 + }, + { + "epoch": 0.45, + "learning_rate": 3.328836908314755e-06, + "logits/chosen": -2.926504373550415, + "logits/rejected": -2.787956714630127, + "logps/chosen": -64.11087799072266, + "logps/rejected": -1041.532470703125, + "loss": 0.0595, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04303840547800064, + "rewards/margins": 10.002165794372559, + "rewards/rejected": -9.959126472473145, + "step": 7600 + }, + { + "epoch": 0.45, + "learning_rate": 3.3239256987259635e-06, + "logits/chosen": -2.896299362182617, + "logits/rejected": -2.827054500579834, + "logps/chosen": -77.9505615234375, + "logps/rejected": -966.1187744140625, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08739982545375824, + "rewards/margins": 9.138105392456055, + "rewards/rejected": -9.225504875183105, + "step": 7610 + }, + { + "epoch": 0.45, + "learning_rate": 3.319010919397929e-06, + "logits/chosen": -2.921833038330078, + "logits/rejected": -2.8029959201812744, + "logps/chosen": -132.29855346679688, + "logps/rejected": -1028.130615234375, + "loss": 0.0262, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5989159345626831, + "rewards/margins": 9.243756294250488, + "rewards/rejected": -9.842672348022461, + "step": 7620 + }, + { + "epoch": 0.45, + "learning_rate": 3.3140925916244184e-06, + "logits/chosen": -2.93902587890625, + "logits/rejected": -2.7790448665618896, + "logps/chosen": -107.2216567993164, + "logps/rejected": -1134.238037109375, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3338576555252075, + "rewards/margins": 10.558181762695312, + "rewards/rejected": -10.892040252685547, + "step": 7630 + }, + { + "epoch": 0.46, + "learning_rate": 3.3091707367145707e-06, + "logits/chosen": -2.867914915084839, + "logits/rejected": -2.764857530593872, + "logps/chosen": -105.13523864746094, + "logps/rejected": -1192.6383056640625, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3823467791080475, + "rewards/margins": 11.101738929748535, + "rewards/rejected": -11.484085083007812, + "step": 7640 + }, + { + "epoch": 0.46, + "learning_rate": 3.304245375992807e-06, + "logits/chosen": -2.9051058292388916, + "logits/rejected": -2.7789852619171143, + "logps/chosen": -179.8166046142578, + "logps/rejected": -1130.5843505859375, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.168277621269226, + "rewards/margins": 9.694743156433105, + "rewards/rejected": -10.863021850585938, + "step": 7650 + }, + { + "epoch": 0.46, + "learning_rate": 3.299316530798738e-06, + "logits/chosen": -2.89345121383667, + "logits/rejected": -2.7720870971679688, + "logps/chosen": -194.74624633789062, + "logps/rejected": -1303.444580078125, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2065147161483765, + "rewards/margins": 11.379701614379883, + "rewards/rejected": -12.586216926574707, + "step": 7660 + }, + { + "epoch": 0.46, + "learning_rate": 3.2943842224870705e-06, + "logits/chosen": -2.884500026702881, + "logits/rejected": -2.7564034461975098, + "logps/chosen": -132.61166381835938, + "logps/rejected": -1193.4124755859375, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5704704523086548, + "rewards/margins": 10.92011833190918, + "rewards/rejected": -11.490588188171387, + "step": 7670 + }, + { + "epoch": 0.46, + "learning_rate": 3.2894484724275156e-06, + "logits/chosen": -2.914498805999756, + "logits/rejected": -2.7918460369110107, + "logps/chosen": -97.73077392578125, + "logps/rejected": -1090.3912353515625, + "loss": 0.048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29716259241104126, + "rewards/margins": 10.16445541381836, + "rewards/rejected": -10.461617469787598, + "step": 7680 + }, + { + "epoch": 0.46, + "learning_rate": 3.284509302004699e-06, + "logits/chosen": -2.8862226009368896, + "logits/rejected": -2.7714197635650635, + "logps/chosen": -108.28077697753906, + "logps/rejected": -1223.725341796875, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3438900113105774, + "rewards/margins": 11.444146156311035, + "rewards/rejected": -11.788037300109863, + "step": 7690 + }, + { + "epoch": 0.46, + "learning_rate": 3.2795667326180604e-06, + "logits/chosen": -2.8831708431243896, + "logits/rejected": -2.812370777130127, + "logps/chosen": -107.17352294921875, + "logps/rejected": -1172.658447265625, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38246119022369385, + "rewards/margins": 10.899229049682617, + "rewards/rejected": -11.28169059753418, + "step": 7700 + }, + { + "epoch": 0.46, + "learning_rate": 3.2746207856817695e-06, + "logits/chosen": -2.8924126625061035, + "logits/rejected": -2.80979585647583, + "logps/chosen": -142.9319305419922, + "logps/rejected": -1171.10693359375, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7194041609764099, + "rewards/margins": 10.549975395202637, + "rewards/rejected": -11.269380569458008, + "step": 7710 + }, + { + "epoch": 0.46, + "learning_rate": 3.2696714826246295e-06, + "logits/chosen": -2.893454074859619, + "logits/rejected": -2.7888636589050293, + "logps/chosen": -119.1712417602539, + "logps/rejected": -1150.808837890625, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.468590646982193, + "rewards/margins": 10.587118148803711, + "rewards/rejected": -11.055707931518555, + "step": 7720 + }, + { + "epoch": 0.46, + "learning_rate": 3.2647188448899813e-06, + "logits/chosen": -2.942380428314209, + "logits/rejected": -2.8053178787231445, + "logps/chosen": -104.38326263427734, + "logps/rejected": -1062.9024658203125, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2865389883518219, + "rewards/margins": 9.900952339172363, + "rewards/rejected": -10.187492370605469, + "step": 7730 + }, + { + "epoch": 0.46, + "learning_rate": 3.2597628939356174e-06, + "logits/chosen": -2.8925106525421143, + "logits/rejected": -2.796748161315918, + "logps/chosen": -102.07750701904297, + "logps/rejected": -1070.4822998046875, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34492790699005127, + "rewards/margins": 9.899534225463867, + "rewards/rejected": -10.244461059570312, + "step": 7740 + }, + { + "epoch": 0.46, + "learning_rate": 3.254803651233683e-06, + "logits/chosen": -2.8944544792175293, + "logits/rejected": -2.7875208854675293, + "logps/chosen": -130.02359008789062, + "logps/rejected": -1073.03173828125, + "loss": 0.0223, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5804868340492249, + "rewards/margins": 9.703164100646973, + "rewards/rejected": -10.283651351928711, + "step": 7750 + }, + { + "epoch": 0.46, + "learning_rate": 3.249841138270585e-06, + "logits/chosen": -2.924694538116455, + "logits/rejected": -2.7893524169921875, + "logps/chosen": -89.68095397949219, + "logps/rejected": -1207.548828125, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21344156563282013, + "rewards/margins": 11.428121566772461, + "rewards/rejected": -11.641563415527344, + "step": 7760 + }, + { + "epoch": 0.46, + "learning_rate": 3.2448753765469e-06, + "logits/chosen": -2.8864364624023438, + "logits/rejected": -2.811795234680176, + "logps/chosen": -89.72996520996094, + "logps/rejected": -1198.156494140625, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20445597171783447, + "rewards/margins": 11.331620216369629, + "rewards/rejected": -11.536075592041016, + "step": 7770 + }, + { + "epoch": 0.46, + "learning_rate": 3.23990638757728e-06, + "logits/chosen": -2.9429805278778076, + "logits/rejected": -2.795192003250122, + "logps/chosen": -131.03366088867188, + "logps/rejected": -1132.643310546875, + "loss": 0.0254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.584152340888977, + "rewards/margins": 10.295206069946289, + "rewards/rejected": -10.879358291625977, + "step": 7780 + }, + { + "epoch": 0.46, + "learning_rate": 3.2349341928903588e-06, + "logits/chosen": -2.907038927078247, + "logits/rejected": -2.77135968208313, + "logps/chosen": -92.76820373535156, + "logps/rejected": -1118.838134765625, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24288122355937958, + "rewards/margins": 10.516764640808105, + "rewards/rejected": -10.75964641571045, + "step": 7790 + }, + { + "epoch": 0.47, + "learning_rate": 3.2299588140286597e-06, + "logits/chosen": -2.8822779655456543, + "logits/rejected": -2.7765870094299316, + "logps/chosen": -121.6458969116211, + "logps/rejected": -1159.571533203125, + "loss": 0.0349, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.518525242805481, + "rewards/margins": 10.626665115356445, + "rewards/rejected": -11.145190238952637, + "step": 7800 + }, + { + "epoch": 0.47, + "learning_rate": 3.2249802725485026e-06, + "logits/chosen": -2.8916025161743164, + "logits/rejected": -2.7779576778411865, + "logps/chosen": -95.29065704345703, + "logps/rejected": -948.3624267578125, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25884923338890076, + "rewards/margins": 8.782289505004883, + "rewards/rejected": -9.0411376953125, + "step": 7810 + }, + { + "epoch": 0.47, + "learning_rate": 3.2199985900199064e-06, + "logits/chosen": -2.906839609146118, + "logits/rejected": -2.773341417312622, + "logps/chosen": -89.33119201660156, + "logps/rejected": -1099.8743896484375, + "loss": 0.0204, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19501051306724548, + "rewards/margins": 10.363716125488281, + "rewards/rejected": -10.558727264404297, + "step": 7820 + }, + { + "epoch": 0.47, + "learning_rate": 3.215013788026504e-06, + "logits/chosen": -2.9322872161865234, + "logits/rejected": -2.7878661155700684, + "logps/chosen": -117.50235748291016, + "logps/rejected": -1091.5369873046875, + "loss": 0.0222, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.4392385482788086, + "rewards/margins": 10.031774520874023, + "rewards/rejected": -10.471014022827148, + "step": 7830 + }, + { + "epoch": 0.47, + "learning_rate": 3.2100258881654387e-06, + "logits/chosen": -2.8693737983703613, + "logits/rejected": -2.759340524673462, + "logps/chosen": -98.27899169921875, + "logps/rejected": -973.5848388671875, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24930746853351593, + "rewards/margins": 9.050195693969727, + "rewards/rejected": -9.299501419067383, + "step": 7840 + }, + { + "epoch": 0.47, + "learning_rate": 3.20503491204728e-06, + "logits/chosen": -2.8951995372772217, + "logits/rejected": -2.7511303424835205, + "logps/chosen": -93.40910339355469, + "logps/rejected": -1145.65576171875, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17460860311985016, + "rewards/margins": 10.842793464660645, + "rewards/rejected": -11.017400741577148, + "step": 7850 + }, + { + "epoch": 0.47, + "learning_rate": 3.200040881295922e-06, + "logits/chosen": -2.8989176750183105, + "logits/rejected": -2.766719102859497, + "logps/chosen": -92.99183654785156, + "logps/rejected": -1149.91748046875, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2718045711517334, + "rewards/margins": 10.780914306640625, + "rewards/rejected": -11.052719116210938, + "step": 7860 + }, + { + "epoch": 0.47, + "learning_rate": 3.1950438175484965e-06, + "logits/chosen": -2.9045536518096924, + "logits/rejected": -2.7655181884765625, + "logps/chosen": -90.64842224121094, + "logps/rejected": -1172.5733642578125, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18401357531547546, + "rewards/margins": 11.10698127746582, + "rewards/rejected": -11.290994644165039, + "step": 7870 + }, + { + "epoch": 0.47, + "learning_rate": 3.1900437424552726e-06, + "logits/chosen": -2.9216113090515137, + "logits/rejected": -2.7956647872924805, + "logps/chosen": -72.9948501586914, + "logps/rejected": -1124.760498046875, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0612676739692688, + "rewards/margins": 10.752792358398438, + "rewards/rejected": -10.81406021118164, + "step": 7880 + }, + { + "epoch": 0.47, + "learning_rate": 3.1850406776795682e-06, + "logits/chosen": -2.9127840995788574, + "logits/rejected": -2.771578311920166, + "logps/chosen": -76.86058807373047, + "logps/rejected": -1020.6849365234375, + "loss": 0.0253, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08427444845438004, + "rewards/margins": 9.686493873596191, + "rewards/rejected": -9.770768165588379, + "step": 7890 + }, + { + "epoch": 0.47, + "learning_rate": 3.1800346448976567e-06, + "logits/chosen": -2.905949592590332, + "logits/rejected": -2.7715139389038086, + "logps/chosen": -107.0735092163086, + "logps/rejected": -1213.9295654296875, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3573954999446869, + "rewards/margins": 11.336355209350586, + "rewards/rejected": -11.693750381469727, + "step": 7900 + }, + { + "epoch": 0.47, + "learning_rate": 3.1750256657986643e-06, + "logits/chosen": -2.9175524711608887, + "logits/rejected": -2.772958278656006, + "logps/chosen": -228.7196807861328, + "logps/rejected": -1282.1923828125, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5395708084106445, + "rewards/margins": 10.842707633972168, + "rewards/rejected": -12.382279396057129, + "step": 7910 + }, + { + "epoch": 0.47, + "learning_rate": 3.1700137620844897e-06, + "logits/chosen": -2.899862289428711, + "logits/rejected": -2.769911527633667, + "logps/chosen": -236.7340087890625, + "logps/rejected": -1178.8040771484375, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7074544429779053, + "rewards/margins": 9.635464668273926, + "rewards/rejected": -11.342917442321777, + "step": 7920 + }, + { + "epoch": 0.47, + "learning_rate": 3.164998955469697e-06, + "logits/chosen": -2.9402148723602295, + "logits/rejected": -2.782379627227783, + "logps/chosen": -160.73495483398438, + "logps/rejected": -1204.7757568359375, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9023953676223755, + "rewards/margins": 10.697343826293945, + "rewards/rejected": -11.599738121032715, + "step": 7930 + }, + { + "epoch": 0.47, + "learning_rate": 3.1599812676814314e-06, + "logits/chosen": -2.9018945693969727, + "logits/rejected": -2.7919204235076904, + "logps/chosen": -119.3823013305664, + "logps/rejected": -1185.866943359375, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5544542670249939, + "rewards/margins": 10.875728607177734, + "rewards/rejected": -11.430182456970215, + "step": 7940 + }, + { + "epoch": 0.47, + "learning_rate": 3.1549607204593185e-06, + "logits/chosen": -2.894233226776123, + "logits/rejected": -2.7141265869140625, + "logps/chosen": -135.49066162109375, + "logps/rejected": -1150.4248046875, + "loss": 0.048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.696251392364502, + "rewards/margins": 10.371603012084961, + "rewards/rejected": -11.067853927612305, + "step": 7950 + }, + { + "epoch": 0.47, + "learning_rate": 3.1499373355553746e-06, + "logits/chosen": -2.8979289531707764, + "logits/rejected": -2.793078899383545, + "logps/chosen": -136.1551513671875, + "logps/rejected": -1105.3221435546875, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6886736154556274, + "rewards/margins": 9.918999671936035, + "rewards/rejected": -10.607671737670898, + "step": 7960 + }, + { + "epoch": 0.48, + "learning_rate": 3.1449111347339084e-06, + "logits/chosen": -2.911642074584961, + "logits/rejected": -2.781906843185425, + "logps/chosen": -103.6140365600586, + "logps/rejected": -1114.9400634765625, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3321247696876526, + "rewards/margins": 10.374422073364258, + "rewards/rejected": -10.70654582977295, + "step": 7970 + }, + { + "epoch": 0.48, + "learning_rate": 3.139882139771431e-06, + "logits/chosen": -2.9380996227264404, + "logits/rejected": -2.7687249183654785, + "logps/chosen": -132.02896118164062, + "logps/rejected": -1165.959228515625, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5754782557487488, + "rewards/margins": 10.640026092529297, + "rewards/rejected": -11.215505599975586, + "step": 7980 + }, + { + "epoch": 0.48, + "learning_rate": 3.134850372456558e-06, + "logits/chosen": -2.89884877204895, + "logits/rejected": -2.799994945526123, + "logps/chosen": -117.5091323852539, + "logps/rejected": -1086.375244140625, + "loss": 0.023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4332999289035797, + "rewards/margins": 9.98084831237793, + "rewards/rejected": -10.414148330688477, + "step": 7990 + }, + { + "epoch": 0.48, + "learning_rate": 3.1298158545899167e-06, + "logits/chosen": -2.9007787704467773, + "logits/rejected": -2.8000261783599854, + "logps/chosen": -132.62811279296875, + "logps/rejected": -1202.259033203125, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6295466423034668, + "rewards/margins": 10.952387809753418, + "rewards/rejected": -11.581934928894043, + "step": 8000 + }, + { + "epoch": 0.48, + "learning_rate": 3.1247786079840513e-06, + "logits/chosen": -2.9133951663970947, + "logits/rejected": -2.7975242137908936, + "logps/chosen": -154.61904907226562, + "logps/rejected": -1088.6337890625, + "loss": 0.1259, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.8594759702682495, + "rewards/margins": 9.582804679870605, + "rewards/rejected": -10.442280769348145, + "step": 8010 + }, + { + "epoch": 0.48, + "learning_rate": 3.11973865446333e-06, + "logits/chosen": -2.917457342147827, + "logits/rejected": -2.787106513977051, + "logps/chosen": -139.6363983154297, + "logps/rejected": -1186.9237060546875, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7446572780609131, + "rewards/margins": 10.687456130981445, + "rewards/rejected": -11.432111740112305, + "step": 8020 + }, + { + "epoch": 0.48, + "learning_rate": 3.1146960158638475e-06, + "logits/chosen": -2.876384735107422, + "logits/rejected": -2.716489315032959, + "logps/chosen": -111.98118591308594, + "logps/rejected": -1249.654052734375, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47215795516967773, + "rewards/margins": 11.570295333862305, + "rewards/rejected": -12.04245376586914, + "step": 8030 + }, + { + "epoch": 0.48, + "learning_rate": 3.109650714033331e-06, + "logits/chosen": -2.9529457092285156, + "logits/rejected": -2.8049941062927246, + "logps/chosen": -103.19281005859375, + "logps/rejected": -1153.6995849609375, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3436373174190521, + "rewards/margins": 10.744990348815918, + "rewards/rejected": -11.088627815246582, + "step": 8040 + }, + { + "epoch": 0.48, + "learning_rate": 3.10460277083105e-06, + "logits/chosen": -2.9112133979797363, + "logits/rejected": -2.773751974105835, + "logps/chosen": -108.43266296386719, + "logps/rejected": -1187.142333984375, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4154531955718994, + "rewards/margins": 11.014824867248535, + "rewards/rejected": -11.430275917053223, + "step": 8050 + }, + { + "epoch": 0.48, + "learning_rate": 3.099552208127713e-06, + "logits/chosen": -2.921924114227295, + "logits/rejected": -2.791663646697998, + "logps/chosen": -103.28163146972656, + "logps/rejected": -1102.4859619140625, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34807831048965454, + "rewards/margins": 10.234344482421875, + "rewards/rejected": -10.582422256469727, + "step": 8060 + }, + { + "epoch": 0.48, + "learning_rate": 3.0944990478053816e-06, + "logits/chosen": -2.8961918354034424, + "logits/rejected": -2.774545192718506, + "logps/chosen": -134.37338256835938, + "logps/rejected": -1182.175048828125, + "loss": 0.0196, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6099185347557068, + "rewards/margins": 10.75146198272705, + "rewards/rejected": -11.361379623413086, + "step": 8070 + }, + { + "epoch": 0.48, + "learning_rate": 3.089443311757371e-06, + "logits/chosen": -2.916269540786743, + "logits/rejected": -2.7896814346313477, + "logps/chosen": -100.31048583984375, + "logps/rejected": -1159.375244140625, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3261149823665619, + "rewards/margins": 10.81511116027832, + "rewards/rejected": -11.141225814819336, + "step": 8080 + }, + { + "epoch": 0.48, + "learning_rate": 3.0843850218881545e-06, + "logits/chosen": -2.9256129264831543, + "logits/rejected": -2.7803902626037598, + "logps/chosen": -99.23483276367188, + "logps/rejected": -1147.093505859375, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.243240088224411, + "rewards/margins": 10.786042213439941, + "rewards/rejected": -11.029282569885254, + "step": 8090 + }, + { + "epoch": 0.48, + "learning_rate": 3.0793242001132725e-06, + "logits/chosen": -2.9398560523986816, + "logits/rejected": -2.758514881134033, + "logps/chosen": -126.73808288574219, + "logps/rejected": -1083.206787109375, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5989679098129272, + "rewards/margins": 9.79456615447998, + "rewards/rejected": -10.393533706665039, + "step": 8100 + }, + { + "epoch": 0.48, + "learning_rate": 3.074260868359233e-06, + "logits/chosen": -2.9151105880737305, + "logits/rejected": -2.7593088150024414, + "logps/chosen": -84.69123840332031, + "logps/rejected": -1154.373291015625, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15928702056407928, + "rewards/margins": 10.934359550476074, + "rewards/rejected": -11.093647956848145, + "step": 8110 + }, + { + "epoch": 0.48, + "learning_rate": 3.0691950485634192e-06, + "logits/chosen": -2.9352869987487793, + "logits/rejected": -2.7683749198913574, + "logps/chosen": -101.80007934570312, + "logps/rejected": -1129.234130859375, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35052749514579773, + "rewards/margins": 10.501663208007812, + "rewards/rejected": -10.852190017700195, + "step": 8120 + }, + { + "epoch": 0.48, + "learning_rate": 3.0641267626739946e-06, + "logits/chosen": -2.946281671524048, + "logits/rejected": -2.8257334232330322, + "logps/chosen": -93.11467742919922, + "logps/rejected": -1135.157470703125, + "loss": 0.0287, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.26752060651779175, + "rewards/margins": 10.6437406539917, + "rewards/rejected": -10.911263465881348, + "step": 8130 + }, + { + "epoch": 0.49, + "learning_rate": 3.059056032649808e-06, + "logits/chosen": -2.876255512237549, + "logits/rejected": -2.770972728729248, + "logps/chosen": -73.00826263427734, + "logps/rejected": -1075.5638427734375, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11690004169940948, + "rewards/margins": 10.196069717407227, + "rewards/rejected": -10.312968254089355, + "step": 8140 + }, + { + "epoch": 0.49, + "learning_rate": 3.0539828804602955e-06, + "logits/chosen": -2.9464824199676514, + "logits/rejected": -2.790735960006714, + "logps/chosen": -74.12883758544922, + "logps/rejected": -1151.718505859375, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01673782430589199, + "rewards/margins": 11.099535942077637, + "rewards/rejected": -11.082798957824707, + "step": 8150 + }, + { + "epoch": 0.49, + "learning_rate": 3.0489073280853886e-06, + "logits/chosen": -2.925861358642578, + "logits/rejected": -2.8040521144866943, + "logps/chosen": -71.85643005371094, + "logps/rejected": -1043.108154296875, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0238981693983078, + "rewards/margins": 10.012449264526367, + "rewards/rejected": -9.988550186157227, + "step": 8160 + }, + { + "epoch": 0.49, + "learning_rate": 3.043829397515419e-06, + "logits/chosen": -2.894137144088745, + "logits/rejected": -2.7600536346435547, + "logps/chosen": -73.14070129394531, + "logps/rejected": -1124.641357421875, + "loss": 0.0208, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.013810524716973305, + "rewards/margins": 10.816551208496094, + "rewards/rejected": -10.802742004394531, + "step": 8170 + }, + { + "epoch": 0.49, + "learning_rate": 3.03874911075102e-06, + "logits/chosen": -2.919699192047119, + "logits/rejected": -2.7573540210723877, + "logps/chosen": -72.86320495605469, + "logps/rejected": -1086.99951171875, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016040001064538956, + "rewards/margins": 10.398584365844727, + "rewards/rejected": -10.414624214172363, + "step": 8180 + }, + { + "epoch": 0.49, + "learning_rate": 3.0336664898030344e-06, + "logits/chosen": -2.922598123550415, + "logits/rejected": -2.8029847145080566, + "logps/chosen": -79.62333679199219, + "logps/rejected": -1090.1444091796875, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06879739463329315, + "rewards/margins": 10.383159637451172, + "rewards/rejected": -10.451955795288086, + "step": 8190 + }, + { + "epoch": 0.49, + "learning_rate": 3.0285815566924186e-06, + "logits/chosen": -2.932765007019043, + "logits/rejected": -2.790095806121826, + "logps/chosen": -84.5161361694336, + "logps/rejected": -1213.811767578125, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15413479506969452, + "rewards/margins": 11.52294635772705, + "rewards/rejected": -11.677081108093262, + "step": 8200 + }, + { + "epoch": 0.49, + "learning_rate": 3.023494333450146e-06, + "logits/chosen": -2.878129482269287, + "logits/rejected": -2.746980667114258, + "logps/chosen": -84.72652435302734, + "logps/rejected": -1167.591552734375, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17867842316627502, + "rewards/margins": 11.049379348754883, + "rewards/rejected": -11.228056907653809, + "step": 8210 + }, + { + "epoch": 0.49, + "learning_rate": 3.018404842117112e-06, + "logits/chosen": -2.9170081615448, + "logits/rejected": -2.777595281600952, + "logps/chosen": -87.08171081542969, + "logps/rejected": -967.5809326171875, + "loss": 0.0247, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15085728466510773, + "rewards/margins": 9.075668334960938, + "rewards/rejected": -9.226526260375977, + "step": 8220 + }, + { + "epoch": 0.49, + "learning_rate": 3.01331310474404e-06, + "logits/chosen": -2.9363760948181152, + "logits/rejected": -2.793762683868408, + "logps/chosen": -89.48347473144531, + "logps/rejected": -1008.2911376953125, + "loss": 0.0211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19423620402812958, + "rewards/margins": 9.455721855163574, + "rewards/rejected": -9.649958610534668, + "step": 8230 + }, + { + "epoch": 0.49, + "learning_rate": 3.0082191433913825e-06, + "logits/chosen": -2.954378366470337, + "logits/rejected": -2.7722232341766357, + "logps/chosen": -78.72822570800781, + "logps/rejected": -1160.857666015625, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10182075202465057, + "rewards/margins": 11.069437980651855, + "rewards/rejected": -11.171258926391602, + "step": 8240 + }, + { + "epoch": 0.49, + "learning_rate": 3.0031229801292293e-06, + "logits/chosen": -2.9497978687286377, + "logits/rejected": -2.7920262813568115, + "logps/chosen": -79.68630981445312, + "logps/rejected": -1183.924072265625, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12509101629257202, + "rewards/margins": 11.281144142150879, + "rewards/rejected": -11.406235694885254, + "step": 8250 + }, + { + "epoch": 0.49, + "learning_rate": 2.99802463703721e-06, + "logits/chosen": -2.9280426502227783, + "logits/rejected": -2.80010986328125, + "logps/chosen": -77.04187774658203, + "logps/rejected": -1131.11962890625, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.041668131947517395, + "rewards/margins": 10.820841789245605, + "rewards/rejected": -10.86251163482666, + "step": 8260 + }, + { + "epoch": 0.49, + "learning_rate": 2.9929241362043976e-06, + "logits/chosen": -2.9086289405822754, + "logits/rejected": -2.7568442821502686, + "logps/chosen": -81.07769775390625, + "logps/rejected": -1087.364013671875, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16367505490779877, + "rewards/margins": 10.270933151245117, + "rewards/rejected": -10.434608459472656, + "step": 8270 + }, + { + "epoch": 0.49, + "learning_rate": 2.9878214997292155e-06, + "logits/chosen": -2.9421534538269043, + "logits/rejected": -2.7460548877716064, + "logps/chosen": -95.58737182617188, + "logps/rejected": -1106.6307373046875, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2242475003004074, + "rewards/margins": 10.409322738647461, + "rewards/rejected": -10.633569717407227, + "step": 8280 + }, + { + "epoch": 0.49, + "learning_rate": 2.9827167497193367e-06, + "logits/chosen": -2.9335103034973145, + "logits/rejected": -2.7946219444274902, + "logps/chosen": -140.57815551757812, + "logps/rejected": -1278.376220703125, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6617147922515869, + "rewards/margins": 11.685511589050293, + "rewards/rejected": -12.347227096557617, + "step": 8290 + }, + { + "epoch": 0.49, + "learning_rate": 2.9776099082915954e-06, + "logits/chosen": -2.911815643310547, + "logits/rejected": -2.742642402648926, + "logps/chosen": -163.64859008789062, + "logps/rejected": -1300.9970703125, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9327988624572754, + "rewards/margins": 11.63986873626709, + "rewards/rejected": -12.572668075561523, + "step": 8300 + }, + { + "epoch": 0.5, + "learning_rate": 2.9725009975718845e-06, + "logits/chosen": -2.8979454040527344, + "logits/rejected": -2.7175180912017822, + "logps/chosen": -154.80947875976562, + "logps/rejected": -1268.8074951171875, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8442990183830261, + "rewards/margins": 11.391578674316406, + "rewards/rejected": -12.235878944396973, + "step": 8310 + }, + { + "epoch": 0.5, + "learning_rate": 2.9673900396950622e-06, + "logits/chosen": -2.931565046310425, + "logits/rejected": -2.8107380867004395, + "logps/chosen": -162.10623168945312, + "logps/rejected": -1088.323486328125, + "loss": 0.0296, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8948472142219543, + "rewards/margins": 9.552961349487305, + "rewards/rejected": -10.447807312011719, + "step": 8320 + }, + { + "epoch": 0.5, + "learning_rate": 2.9622770568048577e-06, + "logits/chosen": -2.9129998683929443, + "logits/rejected": -2.7176756858825684, + "logps/chosen": -160.28585815429688, + "logps/rejected": -1199.1192626953125, + "loss": 0.0284, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8855406641960144, + "rewards/margins": 10.659804344177246, + "rewards/rejected": -11.5453462600708, + "step": 8330 + }, + { + "epoch": 0.5, + "learning_rate": 2.9571620710537726e-06, + "logits/chosen": -2.9029946327209473, + "logits/rejected": -2.7773029804229736, + "logps/chosen": -156.53988647460938, + "logps/rejected": -1178.3660888671875, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.802294135093689, + "rewards/margins": 10.541152000427246, + "rewards/rejected": -11.343446731567383, + "step": 8340 + }, + { + "epoch": 0.5, + "learning_rate": 2.9520451046029862e-06, + "logits/chosen": -2.890559196472168, + "logits/rejected": -2.759887933731079, + "logps/chosen": -154.37278747558594, + "logps/rejected": -1227.537353515625, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8512604832649231, + "rewards/margins": 10.97917366027832, + "rewards/rejected": -11.83043384552002, + "step": 8350 + }, + { + "epoch": 0.5, + "learning_rate": 2.9469261796222608e-06, + "logits/chosen": -2.8820786476135254, + "logits/rejected": -2.721764087677002, + "logps/chosen": -151.35205078125, + "logps/rejected": -1162.476806640625, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8646419644355774, + "rewards/margins": 10.332788467407227, + "rewards/rejected": -11.197429656982422, + "step": 8360 + }, + { + "epoch": 0.5, + "learning_rate": 2.9418053182898428e-06, + "logits/chosen": -2.898127317428589, + "logits/rejected": -2.729259729385376, + "logps/chosen": -139.13018798828125, + "logps/rejected": -1169.8662109375, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.684096097946167, + "rewards/margins": 10.566595077514648, + "rewards/rejected": -11.250690460205078, + "step": 8370 + }, + { + "epoch": 0.5, + "learning_rate": 2.936682542792367e-06, + "logits/chosen": -2.9057202339172363, + "logits/rejected": -2.7429583072662354, + "logps/chosen": -131.05111694335938, + "logps/rejected": -1136.9920654296875, + "loss": 0.0244, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.628420352935791, + "rewards/margins": 10.316336631774902, + "rewards/rejected": -10.944757461547852, + "step": 8380 + }, + { + "epoch": 0.5, + "learning_rate": 2.9315578753247632e-06, + "logits/chosen": -2.9237546920776367, + "logits/rejected": -2.7132420539855957, + "logps/chosen": -134.4027862548828, + "logps/rejected": -1118.904052734375, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5988486409187317, + "rewards/margins": 10.142118453979492, + "rewards/rejected": -10.740968704223633, + "step": 8390 + }, + { + "epoch": 0.5, + "learning_rate": 2.9264313380901586e-06, + "logits/chosen": -2.923072338104248, + "logits/rejected": -2.7244014739990234, + "logps/chosen": -127.77690124511719, + "logps/rejected": -1257.849609375, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5060471892356873, + "rewards/margins": 11.631449699401855, + "rewards/rejected": -12.137496948242188, + "step": 8400 + }, + { + "epoch": 0.5, + "learning_rate": 2.921302953299781e-06, + "logits/chosen": -2.894197940826416, + "logits/rejected": -2.6894216537475586, + "logps/chosen": -142.1927947998047, + "logps/rejected": -1178.7689208984375, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7495073080062866, + "rewards/margins": 10.590059280395508, + "rewards/rejected": -11.339567184448242, + "step": 8410 + }, + { + "epoch": 0.5, + "learning_rate": 2.916172743172861e-06, + "logits/chosen": -2.8771791458129883, + "logits/rejected": -2.72314453125, + "logps/chosen": -180.23348999023438, + "logps/rejected": -1090.023681640625, + "loss": 0.0322, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0421350002288818, + "rewards/margins": 9.425169944763184, + "rewards/rejected": -10.467306137084961, + "step": 8420 + }, + { + "epoch": 0.5, + "learning_rate": 2.911040729936542e-06, + "logits/chosen": -2.904419183731079, + "logits/rejected": -2.737999439239502, + "logps/chosen": -140.25180053710938, + "logps/rejected": -1127.528076171875, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7147947549819946, + "rewards/margins": 10.117956161499023, + "rewards/rejected": -10.83275032043457, + "step": 8430 + }, + { + "epoch": 0.5, + "learning_rate": 2.905906935825774e-06, + "logits/chosen": -2.8962762355804443, + "logits/rejected": -2.665329694747925, + "logps/chosen": -115.6053695678711, + "logps/rejected": -1215.293212890625, + "loss": 0.0227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4125330448150635, + "rewards/margins": 11.294572830200195, + "rewards/rejected": -11.70710563659668, + "step": 8440 + }, + { + "epoch": 0.5, + "learning_rate": 2.900771383083227e-06, + "logits/chosen": -2.902400255203247, + "logits/rejected": -2.688458204269409, + "logps/chosen": -112.76815032958984, + "logps/rejected": -1184.0489501953125, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3807516098022461, + "rewards/margins": 11.021198272705078, + "rewards/rejected": -11.401948928833008, + "step": 8450 + }, + { + "epoch": 0.5, + "learning_rate": 2.895634093959189e-06, + "logits/chosen": -2.8728690147399902, + "logits/rejected": -2.6832470893859863, + "logps/chosen": -95.8866958618164, + "logps/rejected": -1126.6212158203125, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24849529564380646, + "rewards/margins": 10.576570510864258, + "rewards/rejected": -10.825065612792969, + "step": 8460 + }, + { + "epoch": 0.51, + "learning_rate": 2.8904950907114715e-06, + "logits/chosen": -2.919908285140991, + "logits/rejected": -2.7099380493164062, + "logps/chosen": -102.00383758544922, + "logps/rejected": -1214.02587890625, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3236317038536072, + "rewards/margins": 11.365498542785645, + "rewards/rejected": -11.689130783081055, + "step": 8470 + }, + { + "epoch": 0.51, + "learning_rate": 2.885354395605311e-06, + "logits/chosen": -2.9085373878479004, + "logits/rejected": -2.699601411819458, + "logps/chosen": -115.09901428222656, + "logps/rejected": -1174.053955078125, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45333442091941833, + "rewards/margins": 10.83600902557373, + "rewards/rejected": -11.289342880249023, + "step": 8480 + }, + { + "epoch": 0.51, + "learning_rate": 2.880212030913276e-06, + "logits/chosen": -2.88631272315979, + "logits/rejected": -2.7022931575775146, + "logps/chosen": -119.80062103271484, + "logps/rejected": -1064.912841796875, + "loss": 0.0154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5223627090454102, + "rewards/margins": 9.709166526794434, + "rewards/rejected": -10.231529235839844, + "step": 8490 + }, + { + "epoch": 0.51, + "learning_rate": 2.875068018915169e-06, + "logits/chosen": -2.864041805267334, + "logits/rejected": -2.678987741470337, + "logps/chosen": -112.8697280883789, + "logps/rejected": -1098.531005859375, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4213559627532959, + "rewards/margins": 10.123356819152832, + "rewards/rejected": -10.54471206665039, + "step": 8500 + }, + { + "epoch": 0.51, + "learning_rate": 2.8699223818979274e-06, + "logits/chosen": -2.861661195755005, + "logits/rejected": -2.68904972076416, + "logps/chosen": -116.3563232421875, + "logps/rejected": -1156.1903076171875, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4229259490966797, + "rewards/margins": 10.690869331359863, + "rewards/rejected": -11.11379623413086, + "step": 8510 + }, + { + "epoch": 0.51, + "learning_rate": 2.8647751421555313e-06, + "logits/chosen": -2.881359577178955, + "logits/rejected": -2.681013822555542, + "logps/chosen": -121.25843811035156, + "logps/rejected": -1274.0323486328125, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4660007059574127, + "rewards/margins": 11.827128410339355, + "rewards/rejected": -12.293130874633789, + "step": 8520 + }, + { + "epoch": 0.51, + "learning_rate": 2.859626321988903e-06, + "logits/chosen": -2.8887641429901123, + "logits/rejected": -2.6403064727783203, + "logps/chosen": -135.89137268066406, + "logps/rejected": -1120.910400390625, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6892722249031067, + "rewards/margins": 10.080706596374512, + "rewards/rejected": -10.769978523254395, + "step": 8530 + }, + { + "epoch": 0.51, + "learning_rate": 2.8544759437058135e-06, + "logits/chosen": -2.8979439735412598, + "logits/rejected": -2.6841869354248047, + "logps/chosen": -170.7774200439453, + "logps/rejected": -1199.5584716796875, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.995023250579834, + "rewards/margins": 10.565732955932617, + "rewards/rejected": -11.56075668334961, + "step": 8540 + }, + { + "epoch": 0.51, + "learning_rate": 2.8493240296207835e-06, + "logits/chosen": -2.843690872192383, + "logits/rejected": -2.6437625885009766, + "logps/chosen": -223.54745483398438, + "logps/rejected": -1286.1190185546875, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5435632467269897, + "rewards/margins": 10.872712135314941, + "rewards/rejected": -12.416275978088379, + "step": 8550 + }, + { + "epoch": 0.51, + "learning_rate": 2.844170602054989e-06, + "logits/chosen": -2.8815646171569824, + "logits/rejected": -2.62292218208313, + "logps/chosen": -206.5679473876953, + "logps/rejected": -1223.450927734375, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3406312465667725, + "rewards/margins": 10.451102256774902, + "rewards/rejected": -11.79173469543457, + "step": 8560 + }, + { + "epoch": 0.51, + "learning_rate": 2.8390156833361616e-06, + "logits/chosen": -2.8827974796295166, + "logits/rejected": -2.6515748500823975, + "logps/chosen": -216.79373168945312, + "logps/rejected": -1198.4493408203125, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3943380117416382, + "rewards/margins": 10.1595458984375, + "rewards/rejected": -11.55388355255127, + "step": 8570 + }, + { + "epoch": 0.51, + "learning_rate": 2.833859295798495e-06, + "logits/chosen": -2.840674877166748, + "logits/rejected": -2.6365323066711426, + "logps/chosen": -193.5364990234375, + "logps/rejected": -1295.1944580078125, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1920167207717896, + "rewards/margins": 11.305582046508789, + "rewards/rejected": -12.497599601745605, + "step": 8580 + }, + { + "epoch": 0.51, + "learning_rate": 2.828701461782546e-06, + "logits/chosen": -2.8499903678894043, + "logits/rejected": -2.663597583770752, + "logps/chosen": -180.19314575195312, + "logps/rejected": -1165.320068359375, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1210962533950806, + "rewards/margins": 10.096768379211426, + "rewards/rejected": -11.217864036560059, + "step": 8590 + }, + { + "epoch": 0.51, + "learning_rate": 2.8235422036351384e-06, + "logits/chosen": -2.842646837234497, + "logits/rejected": -2.6701502799987793, + "logps/chosen": -167.73040771484375, + "logps/rejected": -1202.8580322265625, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0205423831939697, + "rewards/margins": 10.567150115966797, + "rewards/rejected": -11.587693214416504, + "step": 8600 + }, + { + "epoch": 0.51, + "learning_rate": 2.818381543709267e-06, + "logits/chosen": -2.8757827281951904, + "logits/rejected": -2.644541025161743, + "logps/chosen": -179.77206420898438, + "logps/rejected": -1292.0792236328125, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0578577518463135, + "rewards/margins": 11.416297912597656, + "rewards/rejected": -12.474156379699707, + "step": 8610 + }, + { + "epoch": 0.51, + "learning_rate": 2.813219504363998e-06, + "logits/chosen": -2.8929104804992676, + "logits/rejected": -2.6589672565460205, + "logps/chosen": -186.99432373046875, + "logps/rejected": -1229.5103759765625, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.116969347000122, + "rewards/margins": 10.737485885620117, + "rewards/rejected": -11.854455947875977, + "step": 8620 + }, + { + "epoch": 0.51, + "learning_rate": 2.8080561079643758e-06, + "logits/chosen": -2.8735790252685547, + "logits/rejected": -2.6584935188293457, + "logps/chosen": -177.3016815185547, + "logps/rejected": -1353.6968994140625, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1050446033477783, + "rewards/margins": 12.000685691833496, + "rewards/rejected": -13.105731010437012, + "step": 8630 + }, + { + "epoch": 0.52, + "learning_rate": 2.802891376881325e-06, + "logits/chosen": -2.859884738922119, + "logits/rejected": -2.6368844509124756, + "logps/chosen": -164.41403198242188, + "logps/rejected": -1120.52392578125, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9209758043289185, + "rewards/margins": 9.83740520477295, + "rewards/rejected": -10.758380889892578, + "step": 8640 + }, + { + "epoch": 0.52, + "learning_rate": 2.7977253334915495e-06, + "logits/chosen": -2.848546028137207, + "logits/rejected": -2.6064257621765137, + "logps/chosen": -165.62997436523438, + "logps/rejected": -1182.0648193359375, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9194143414497375, + "rewards/margins": 10.454706192016602, + "rewards/rejected": -11.374120712280273, + "step": 8650 + }, + { + "epoch": 0.52, + "learning_rate": 2.7925580001774422e-06, + "logits/chosen": -2.8367691040039062, + "logits/rejected": -2.595975160598755, + "logps/chosen": -146.14683532714844, + "logps/rejected": -1265.567626953125, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6889903545379639, + "rewards/margins": 11.512295722961426, + "rewards/rejected": -12.201287269592285, + "step": 8660 + }, + { + "epoch": 0.52, + "learning_rate": 2.787389399326984e-06, + "logits/chosen": -2.8577499389648438, + "logits/rejected": -2.6285009384155273, + "logps/chosen": -146.45794677734375, + "logps/rejected": -1191.8450927734375, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7575078010559082, + "rewards/margins": 10.720059394836426, + "rewards/rejected": -11.477567672729492, + "step": 8670 + }, + { + "epoch": 0.52, + "learning_rate": 2.7822195533336466e-06, + "logits/chosen": -2.85831880569458, + "logits/rejected": -2.659726142883301, + "logps/chosen": -140.2643585205078, + "logps/rejected": -1153.3939208984375, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6969577074050903, + "rewards/margins": 10.404402732849121, + "rewards/rejected": -11.101360321044922, + "step": 8680 + }, + { + "epoch": 0.52, + "learning_rate": 2.7770484845962976e-06, + "logits/chosen": -2.908207416534424, + "logits/rejected": -2.667222499847412, + "logps/chosen": -142.89402770996094, + "logps/rejected": -1020.4970703125, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7316961288452148, + "rewards/margins": 9.035357475280762, + "rewards/rejected": -9.767054557800293, + "step": 8690 + }, + { + "epoch": 0.52, + "learning_rate": 2.7718762155191015e-06, + "logits/chosen": -2.884187936782837, + "logits/rejected": -2.6048455238342285, + "logps/chosen": -130.8945770263672, + "logps/rejected": -1101.829833984375, + "loss": 0.0183, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5116895437240601, + "rewards/margins": 10.068532943725586, + "rewards/rejected": -10.580222129821777, + "step": 8700 + }, + { + "epoch": 0.52, + "learning_rate": 2.766702768511423e-06, + "logits/chosen": -2.8269100189208984, + "logits/rejected": -2.639131546020508, + "logps/chosen": -109.55877685546875, + "logps/rejected": -989.4346923828125, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42994171380996704, + "rewards/margins": 9.02569580078125, + "rewards/rejected": -9.455636978149414, + "step": 8710 + }, + { + "epoch": 0.52, + "learning_rate": 2.7615281659877304e-06, + "logits/chosen": -2.884887456893921, + "logits/rejected": -2.668712854385376, + "logps/chosen": -125.5203857421875, + "logps/rejected": -1095.218505859375, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5232694745063782, + "rewards/margins": 9.987313270568848, + "rewards/rejected": -10.510583877563477, + "step": 8720 + }, + { + "epoch": 0.52, + "learning_rate": 2.7563524303675005e-06, + "logits/chosen": -2.804560899734497, + "logits/rejected": -2.530867099761963, + "logps/chosen": -150.4720001220703, + "logps/rejected": -1232.0577392578125, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7528031468391418, + "rewards/margins": 11.117652893066406, + "rewards/rejected": -11.870455741882324, + "step": 8730 + }, + { + "epoch": 0.52, + "learning_rate": 2.7511755840751165e-06, + "logits/chosen": -2.8314120769500732, + "logits/rejected": -2.6416478157043457, + "logps/chosen": -129.7180938720703, + "logps/rejected": -1180.48974609375, + "loss": 0.0333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6153192520141602, + "rewards/margins": 10.765087127685547, + "rewards/rejected": -11.380406379699707, + "step": 8740 + }, + { + "epoch": 0.52, + "learning_rate": 2.7459976495397738e-06, + "logits/chosen": -2.8279497623443604, + "logits/rejected": -2.5734431743621826, + "logps/chosen": -159.1131134033203, + "logps/rejected": -1189.624267578125, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8376861810684204, + "rewards/margins": 10.623645782470703, + "rewards/rejected": -11.461332321166992, + "step": 8750 + }, + { + "epoch": 0.52, + "learning_rate": 2.7408186491953862e-06, + "logits/chosen": -2.851015567779541, + "logits/rejected": -2.5913257598876953, + "logps/chosen": -142.84890747070312, + "logps/rejected": -1179.016845703125, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7569302916526794, + "rewards/margins": 10.598562240600586, + "rewards/rejected": -11.355490684509277, + "step": 8760 + }, + { + "epoch": 0.52, + "learning_rate": 2.735638605480482e-06, + "logits/chosen": -2.8595190048217773, + "logits/rejected": -2.582789182662964, + "logps/chosen": -144.33029174804688, + "logps/rejected": -1175.071044921875, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7189782857894897, + "rewards/margins": 10.575189590454102, + "rewards/rejected": -11.294167518615723, + "step": 8770 + }, + { + "epoch": 0.52, + "learning_rate": 2.730457540838109e-06, + "logits/chosen": -2.81011962890625, + "logits/rejected": -2.5828940868377686, + "logps/chosen": -129.65719604492188, + "logps/rejected": -1138.3101806640625, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6193026900291443, + "rewards/margins": 10.32276439666748, + "rewards/rejected": -10.942068099975586, + "step": 8780 + }, + { + "epoch": 0.52, + "learning_rate": 2.725275477715743e-06, + "logits/chosen": -2.8384578227996826, + "logits/rejected": -2.548628568649292, + "logps/chosen": -172.1205596923828, + "logps/rejected": -1171.3193359375, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0557301044464111, + "rewards/margins": 10.209169387817383, + "rewards/rejected": -11.264899253845215, + "step": 8790 + }, + { + "epoch": 0.52, + "learning_rate": 2.7200924385651805e-06, + "logits/chosen": -2.8204004764556885, + "logits/rejected": -2.549956798553467, + "logps/chosen": -207.0591278076172, + "logps/rejected": -1209.3974609375, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4057868719100952, + "rewards/margins": 10.251618385314941, + "rewards/rejected": -11.657405853271484, + "step": 8800 + }, + { + "epoch": 0.53, + "learning_rate": 2.7149084458424497e-06, + "logits/chosen": -2.8310275077819824, + "logits/rejected": -2.6350979804992676, + "logps/chosen": -248.27474975585938, + "logps/rejected": -1290.673095703125, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8052504062652588, + "rewards/margins": 10.661766052246094, + "rewards/rejected": -12.46701717376709, + "step": 8810 + }, + { + "epoch": 0.53, + "learning_rate": 2.70972352200771e-06, + "logits/chosen": -2.8514134883880615, + "logits/rejected": -2.5841760635375977, + "logps/chosen": -240.83657836914062, + "logps/rejected": -1387.0308837890625, + "loss": 0.0261, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.6830450296401978, + "rewards/margins": 11.736095428466797, + "rewards/rejected": -13.419140815734863, + "step": 8820 + }, + { + "epoch": 0.53, + "learning_rate": 2.7045376895251544e-06, + "logits/chosen": -2.824413776397705, + "logits/rejected": -2.594223737716675, + "logps/chosen": -171.55313110351562, + "logps/rejected": -1272.257080078125, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1000640392303467, + "rewards/margins": 11.190762519836426, + "rewards/rejected": -12.290824890136719, + "step": 8830 + }, + { + "epoch": 0.53, + "learning_rate": 2.6993509708629133e-06, + "logits/chosen": -2.858731746673584, + "logits/rejected": -2.634040117263794, + "logps/chosen": -211.41061401367188, + "logps/rejected": -1314.533203125, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.388708472251892, + "rewards/margins": 11.303516387939453, + "rewards/rejected": -12.69222354888916, + "step": 8840 + }, + { + "epoch": 0.53, + "learning_rate": 2.694163388492957e-06, + "logits/chosen": -2.8316709995269775, + "logits/rejected": -2.6018905639648438, + "logps/chosen": -169.09791564941406, + "logps/rejected": -1177.427490234375, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9727762937545776, + "rewards/margins": 10.349662780761719, + "rewards/rejected": -11.322439193725586, + "step": 8850 + }, + { + "epoch": 0.53, + "learning_rate": 2.6889749648909946e-06, + "logits/chosen": -2.8448777198791504, + "logits/rejected": -2.56866717338562, + "logps/chosen": -123.35162353515625, + "logps/rejected": -1396.155517578125, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5139948725700378, + "rewards/margins": 12.987811088562012, + "rewards/rejected": -13.501806259155273, + "step": 8860 + }, + { + "epoch": 0.53, + "learning_rate": 2.6837857225363837e-06, + "logits/chosen": -2.830170154571533, + "logits/rejected": -2.59755277633667, + "logps/chosen": -116.15406799316406, + "logps/rejected": -1179.905517578125, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5133495926856995, + "rewards/margins": 10.840788841247559, + "rewards/rejected": -11.35413932800293, + "step": 8870 + }, + { + "epoch": 0.53, + "learning_rate": 2.6785956839120294e-06, + "logits/chosen": -2.827650308609009, + "logits/rejected": -2.6163816452026367, + "logps/chosen": -117.0268783569336, + "logps/rejected": -1123.203369140625, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.510259747505188, + "rewards/margins": 10.284109115600586, + "rewards/rejected": -10.794370651245117, + "step": 8880 + }, + { + "epoch": 0.53, + "learning_rate": 2.6734048715042824e-06, + "logits/chosen": -2.817591905593872, + "logits/rejected": -2.5745701789855957, + "logps/chosen": -111.74925231933594, + "logps/rejected": -1238.2119140625, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4261833131313324, + "rewards/margins": 11.530172348022461, + "rewards/rejected": -11.9563570022583, + "step": 8890 + }, + { + "epoch": 0.53, + "learning_rate": 2.668213307802851e-06, + "logits/chosen": -2.806105136871338, + "logits/rejected": -2.591003894805908, + "logps/chosen": -115.70748138427734, + "logps/rejected": -1178.735107421875, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40064820647239685, + "rewards/margins": 10.935684204101562, + "rewards/rejected": -11.336334228515625, + "step": 8900 + }, + { + "epoch": 0.53, + "learning_rate": 2.663021015300695e-06, + "logits/chosen": -2.8011221885681152, + "logits/rejected": -2.572206497192383, + "logps/chosen": -109.93327331542969, + "logps/rejected": -1094.0491943359375, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3877597451210022, + "rewards/margins": 10.114436149597168, + "rewards/rejected": -10.502195358276367, + "step": 8910 + }, + { + "epoch": 0.53, + "learning_rate": 2.657828016493933e-06, + "logits/chosen": -2.80245041847229, + "logits/rejected": -2.573734760284424, + "logps/chosen": -126.8602066040039, + "logps/rejected": -1155.0802001953125, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5544692873954773, + "rewards/margins": 10.555421829223633, + "rewards/rejected": -11.109891891479492, + "step": 8920 + }, + { + "epoch": 0.53, + "learning_rate": 2.6526343338817445e-06, + "logits/chosen": -2.8176636695861816, + "logits/rejected": -2.562530517578125, + "logps/chosen": -151.31910705566406, + "logps/rejected": -1200.107666015625, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7892540693283081, + "rewards/margins": 10.772626876831055, + "rewards/rejected": -11.561881065368652, + "step": 8930 + }, + { + "epoch": 0.53, + "learning_rate": 2.647439989966272e-06, + "logits/chosen": -2.8049850463867188, + "logits/rejected": -2.5129425525665283, + "logps/chosen": -114.68141174316406, + "logps/rejected": -1238.9661865234375, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47490543127059937, + "rewards/margins": 11.465176582336426, + "rewards/rejected": -11.940081596374512, + "step": 8940 + }, + { + "epoch": 0.53, + "learning_rate": 2.6422450072525198e-06, + "logits/chosen": -2.823791265487671, + "logits/rejected": -2.569718360900879, + "logps/chosen": -108.46688079833984, + "logps/rejected": -1020.7151489257812, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41460657119750977, + "rewards/margins": 9.34644889831543, + "rewards/rejected": -9.761054992675781, + "step": 8950 + }, + { + "epoch": 0.53, + "learning_rate": 2.6370494082482632e-06, + "logits/chosen": -2.809652805328369, + "logits/rejected": -2.5150139331817627, + "logps/chosen": -119.2606201171875, + "logps/rejected": -1105.090576171875, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4620642066001892, + "rewards/margins": 10.13792610168457, + "rewards/rejected": -10.599990844726562, + "step": 8960 + }, + { + "epoch": 0.53, + "learning_rate": 2.6318532154639474e-06, + "logits/chosen": -2.7741401195526123, + "logits/rejected": -2.5240073204040527, + "logps/chosen": -126.09135437011719, + "logps/rejected": -1223.4954833984375, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.522668182849884, + "rewards/margins": 11.278470993041992, + "rewards/rejected": -11.801138877868652, + "step": 8970 + }, + { + "epoch": 0.54, + "learning_rate": 2.626656451412588e-06, + "logits/chosen": -2.8281302452087402, + "logits/rejected": -2.5706429481506348, + "logps/chosen": -118.5013198852539, + "logps/rejected": -1252.6693115234375, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4158239960670471, + "rewards/margins": 11.657991409301758, + "rewards/rejected": -12.07381534576416, + "step": 8980 + }, + { + "epoch": 0.54, + "learning_rate": 2.6214591386096782e-06, + "logits/chosen": -2.7473807334899902, + "logits/rejected": -2.478829860687256, + "logps/chosen": -101.12090301513672, + "logps/rejected": -1214.964599609375, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31386569142341614, + "rewards/margins": 11.400246620178223, + "rewards/rejected": -11.714112281799316, + "step": 8990 + }, + { + "epoch": 0.54, + "learning_rate": 2.6162612995730874e-06, + "logits/chosen": -2.8409926891326904, + "logits/rejected": -2.567551374435425, + "logps/chosen": -114.22438049316406, + "logps/rejected": -1286.527099609375, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3678627610206604, + "rewards/margins": 12.063667297363281, + "rewards/rejected": -12.431530952453613, + "step": 9000 + }, + { + "epoch": 0.54, + "learning_rate": 2.6110629568229647e-06, + "logits/chosen": -2.816166400909424, + "logits/rejected": -2.54646635055542, + "logps/chosen": -105.74687194824219, + "logps/rejected": -1147.75146484375, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33271974325180054, + "rewards/margins": 10.716184616088867, + "rewards/rejected": -11.048904418945312, + "step": 9010 + }, + { + "epoch": 0.54, + "learning_rate": 2.6058641328816425e-06, + "logits/chosen": -2.8309741020202637, + "logits/rejected": -2.5696823596954346, + "logps/chosen": -127.2796859741211, + "logps/rejected": -1033.6153564453125, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5708521604537964, + "rewards/margins": 9.325971603393555, + "rewards/rejected": -9.896822929382324, + "step": 9020 + }, + { + "epoch": 0.54, + "learning_rate": 2.6006648502735384e-06, + "logits/chosen": -2.801985263824463, + "logits/rejected": -2.563275098800659, + "logps/chosen": -102.72383117675781, + "logps/rejected": -1021.3406982421875, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3138514757156372, + "rewards/margins": 9.46950626373291, + "rewards/rejected": -9.783357620239258, + "step": 9030 + }, + { + "epoch": 0.54, + "learning_rate": 2.5954651315250543e-06, + "logits/chosen": -2.792391538619995, + "logits/rejected": -2.4843857288360596, + "logps/chosen": -105.3338851928711, + "logps/rejected": -1155.8475341796875, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3732788860797882, + "rewards/margins": 10.75007438659668, + "rewards/rejected": -11.12335205078125, + "step": 9040 + }, + { + "epoch": 0.54, + "learning_rate": 2.5902649991644855e-06, + "logits/chosen": -2.7931556701660156, + "logits/rejected": -2.510646104812622, + "logps/chosen": -123.06136322021484, + "logps/rejected": -1246.3153076171875, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5123059749603271, + "rewards/margins": 11.507547378540039, + "rewards/rejected": -12.019853591918945, + "step": 9050 + }, + { + "epoch": 0.54, + "learning_rate": 2.5850644757219177e-06, + "logits/chosen": -2.795856475830078, + "logits/rejected": -2.4899067878723145, + "logps/chosen": -128.98214721679688, + "logps/rejected": -1215.587646484375, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5626493692398071, + "rewards/margins": 11.163839340209961, + "rewards/rejected": -11.726489067077637, + "step": 9060 + }, + { + "epoch": 0.54, + "learning_rate": 2.5798635837291304e-06, + "logits/chosen": -2.774034023284912, + "logits/rejected": -2.488168478012085, + "logps/chosen": -124.50978088378906, + "logps/rejected": -1135.6466064453125, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5497738718986511, + "rewards/margins": 10.363019943237305, + "rewards/rejected": -10.91279411315918, + "step": 9070 + }, + { + "epoch": 0.54, + "learning_rate": 2.5746623457194996e-06, + "logits/chosen": -2.7708590030670166, + "logits/rejected": -2.4828476905822754, + "logps/chosen": -134.91110229492188, + "logps/rejected": -1292.5389404296875, + "loss": 0.0228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6623948216438293, + "rewards/margins": 11.825265884399414, + "rewards/rejected": -12.48766040802002, + "step": 9080 + }, + { + "epoch": 0.54, + "learning_rate": 2.569460784227903e-06, + "logits/chosen": -2.7878754138946533, + "logits/rejected": -2.476685047149658, + "logps/chosen": -168.18325805664062, + "logps/rejected": -1254.955810546875, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.993513286113739, + "rewards/margins": 11.11749267578125, + "rewards/rejected": -12.111004829406738, + "step": 9090 + }, + { + "epoch": 0.54, + "learning_rate": 2.5642589217906164e-06, + "logits/chosen": -2.7759673595428467, + "logits/rejected": -2.442605972290039, + "logps/chosen": -164.1595001220703, + "logps/rejected": -1352.0487060546875, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9402977228164673, + "rewards/margins": 12.136119842529297, + "rewards/rejected": -13.076417922973633, + "step": 9100 + }, + { + "epoch": 0.54, + "learning_rate": 2.559056780945223e-06, + "logits/chosen": -2.801144599914551, + "logits/rejected": -2.4511032104492188, + "logps/chosen": -183.38833618164062, + "logps/rejected": -1290.93603515625, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1023266315460205, + "rewards/margins": 11.362133026123047, + "rewards/rejected": -12.464460372924805, + "step": 9110 + }, + { + "epoch": 0.54, + "learning_rate": 2.5538543842305085e-06, + "logits/chosen": -2.7863965034484863, + "logits/rejected": -2.425858974456787, + "logps/chosen": -187.49612426757812, + "logps/rejected": -1302.1209716796875, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1713062524795532, + "rewards/margins": 11.404998779296875, + "rewards/rejected": -12.576306343078613, + "step": 9120 + }, + { + "epoch": 0.54, + "learning_rate": 2.5486517541863696e-06, + "logits/chosen": -2.7975218296051025, + "logits/rejected": -2.4785380363464355, + "logps/chosen": -182.64633178710938, + "logps/rejected": -1255.8494873046875, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.065992832183838, + "rewards/margins": 11.067254066467285, + "rewards/rejected": -12.133247375488281, + "step": 9130 + }, + { + "epoch": 0.55, + "learning_rate": 2.5434489133537154e-06, + "logits/chosen": -2.802154541015625, + "logits/rejected": -2.4505929946899414, + "logps/chosen": -189.68191528320312, + "logps/rejected": -1365.123291015625, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1531507968902588, + "rewards/margins": 12.054038047790527, + "rewards/rejected": -13.207188606262207, + "step": 9140 + }, + { + "epoch": 0.55, + "learning_rate": 2.5382458842743634e-06, + "logits/chosen": -2.7340104579925537, + "logits/rejected": -2.4547784328460693, + "logps/chosen": -179.56491088867188, + "logps/rejected": -1334.59765625, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1335508823394775, + "rewards/margins": 11.76144790649414, + "rewards/rejected": -12.894998550415039, + "step": 9150 + }, + { + "epoch": 0.55, + "learning_rate": 2.53304268949095e-06, + "logits/chosen": -2.7387959957122803, + "logits/rejected": -2.469914197921753, + "logps/chosen": -182.5727081298828, + "logps/rejected": -1299.2421875, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1277105808258057, + "rewards/margins": 11.406871795654297, + "rewards/rejected": -12.534584045410156, + "step": 9160 + }, + { + "epoch": 0.55, + "learning_rate": 2.5278393515468312e-06, + "logits/chosen": -2.7731475830078125, + "logits/rejected": -2.401132583618164, + "logps/chosen": -172.27481079101562, + "logps/rejected": -1232.1500244140625, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9934558868408203, + "rewards/margins": 10.876480102539062, + "rewards/rejected": -11.869935989379883, + "step": 9170 + }, + { + "epoch": 0.55, + "learning_rate": 2.5226358929859793e-06, + "logits/chosen": -2.8033432960510254, + "logits/rejected": -2.4295883178710938, + "logps/chosen": -208.0424041748047, + "logps/rejected": -1171.468994140625, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3904781341552734, + "rewards/margins": 9.89474868774414, + "rewards/rejected": -11.28522777557373, + "step": 9180 + }, + { + "epoch": 0.55, + "learning_rate": 2.517432336352891e-06, + "logits/chosen": -2.7643942832946777, + "logits/rejected": -2.405329704284668, + "logps/chosen": -196.18893432617188, + "logps/rejected": -1373.046875, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2262336015701294, + "rewards/margins": 12.06359577178955, + "rewards/rejected": -13.289830207824707, + "step": 9190 + }, + { + "epoch": 0.55, + "learning_rate": 2.5122287041924897e-06, + "logits/chosen": -2.773871660232544, + "logits/rejected": -2.459120035171509, + "logps/chosen": -154.21078491210938, + "logps/rejected": -1331.728515625, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8379950523376465, + "rewards/margins": 12.044095993041992, + "rewards/rejected": -12.88209056854248, + "step": 9200 + }, + { + "epoch": 0.55, + "learning_rate": 2.507025019050022e-06, + "logits/chosen": -2.7721176147460938, + "logits/rejected": -2.4530961513519287, + "logps/chosen": -180.89395141601562, + "logps/rejected": -1257.1702880859375, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1186147928237915, + "rewards/margins": 11.023232460021973, + "rewards/rejected": -12.141847610473633, + "step": 9210 + }, + { + "epoch": 0.55, + "learning_rate": 2.5018213034709683e-06, + "logits/chosen": -2.7496094703674316, + "logits/rejected": -2.391211748123169, + "logps/chosen": -146.93063354492188, + "logps/rejected": -1182.9371337890625, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7501407265663147, + "rewards/margins": 10.64448356628418, + "rewards/rejected": -11.394624710083008, + "step": 9220 + }, + { + "epoch": 0.55, + "learning_rate": 2.496617580000937e-06, + "logits/chosen": -2.751349925994873, + "logits/rejected": -2.372615098953247, + "logps/chosen": -143.75375366210938, + "logps/rejected": -1195.037109375, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7807610034942627, + "rewards/margins": 10.714524269104004, + "rewards/rejected": -11.495285034179688, + "step": 9230 + }, + { + "epoch": 0.55, + "learning_rate": 2.491413871185574e-06, + "logits/chosen": -2.7620747089385986, + "logits/rejected": -2.4743142127990723, + "logps/chosen": -162.22793579101562, + "logps/rejected": -1202.9722900390625, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9299749135971069, + "rewards/margins": 10.657004356384277, + "rewards/rejected": -11.586979866027832, + "step": 9240 + }, + { + "epoch": 0.55, + "learning_rate": 2.486210199570459e-06, + "logits/chosen": -2.766876697540283, + "logits/rejected": -2.427877902984619, + "logps/chosen": -151.55001831054688, + "logps/rejected": -1301.142822265625, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8024405241012573, + "rewards/margins": 11.776586532592773, + "rewards/rejected": -12.57902717590332, + "step": 9250 + }, + { + "epoch": 0.55, + "learning_rate": 2.4810065877010137e-06, + "logits/chosen": -2.75538969039917, + "logits/rejected": -2.447815418243408, + "logps/chosen": -154.34844970703125, + "logps/rejected": -1244.7877197265625, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7802685499191284, + "rewards/margins": 11.223494529724121, + "rewards/rejected": -12.003763198852539, + "step": 9260 + }, + { + "epoch": 0.55, + "learning_rate": 2.475803058122397e-06, + "logits/chosen": -2.788378953933716, + "logits/rejected": -2.407958507537842, + "logps/chosen": -122.7944564819336, + "logps/rejected": -1211.561279296875, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5630209445953369, + "rewards/margins": 11.120402336120605, + "rewards/rejected": -11.68342399597168, + "step": 9270 + }, + { + "epoch": 0.55, + "learning_rate": 2.470599633379415e-06, + "logits/chosen": -2.7260079383850098, + "logits/rejected": -2.438333511352539, + "logps/chosen": -156.7364501953125, + "logps/rejected": -1286.427001953125, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8463515043258667, + "rewards/margins": 11.567557334899902, + "rewards/rejected": -12.413908958435059, + "step": 9280 + }, + { + "epoch": 0.55, + "learning_rate": 2.465396336016417e-06, + "logits/chosen": -2.7842154502868652, + "logits/rejected": -2.4713377952575684, + "logps/chosen": -132.51913452148438, + "logps/rejected": -1383.843505859375, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5946775674819946, + "rewards/margins": 12.794930458068848, + "rewards/rejected": -13.389608383178711, + "step": 9290 + }, + { + "epoch": 0.55, + "learning_rate": 2.460193188577201e-06, + "logits/chosen": -2.722109794616699, + "logits/rejected": -2.302485704421997, + "logps/chosen": -175.0625762939453, + "logps/rejected": -1191.669921875, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0287086963653564, + "rewards/margins": 10.449769973754883, + "rewards/rejected": -11.47847843170166, + "step": 9300 + }, + { + "epoch": 0.56, + "learning_rate": 2.454990213604917e-06, + "logits/chosen": -2.763545274734497, + "logits/rejected": -2.3513073921203613, + "logps/chosen": -131.9331817626953, + "logps/rejected": -1234.7012939453125, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6387866735458374, + "rewards/margins": 11.270801544189453, + "rewards/rejected": -11.909588813781738, + "step": 9310 + }, + { + "epoch": 0.56, + "learning_rate": 2.449787433641965e-06, + "logits/chosen": -2.747870445251465, + "logits/rejected": -2.358887195587158, + "logps/chosen": -124.69953918457031, + "logps/rejected": -1216.431884765625, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5647678971290588, + "rewards/margins": 11.141289710998535, + "rewards/rejected": -11.706056594848633, + "step": 9320 + }, + { + "epoch": 0.56, + "learning_rate": 2.4445848712299027e-06, + "logits/chosen": -2.7382333278656006, + "logits/rejected": -2.403057813644409, + "logps/chosen": -151.37469482421875, + "logps/rejected": -1220.283935546875, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8205707669258118, + "rewards/margins": 10.940628051757812, + "rewards/rejected": -11.761198043823242, + "step": 9330 + }, + { + "epoch": 0.56, + "learning_rate": 2.4393825489093438e-06, + "logits/chosen": -2.746755599975586, + "logits/rejected": -2.336787700653076, + "logps/chosen": -138.52200317382812, + "logps/rejected": -1274.541748046875, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.691260039806366, + "rewards/margins": 11.603950500488281, + "rewards/rejected": -12.295210838317871, + "step": 9340 + }, + { + "epoch": 0.56, + "learning_rate": 2.434180489219863e-06, + "logits/chosen": -2.764892339706421, + "logits/rejected": -2.375894546508789, + "logps/chosen": -152.8723907470703, + "logps/rejected": -1170.1402587890625, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.868782639503479, + "rewards/margins": 10.39822006225586, + "rewards/rejected": -11.267003059387207, + "step": 9350 + }, + { + "epoch": 0.56, + "learning_rate": 2.428978714699894e-06, + "logits/chosen": -2.722238540649414, + "logits/rejected": -2.366903305053711, + "logps/chosen": -167.43258666992188, + "logps/rejected": -1381.462158203125, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9342926144599915, + "rewards/margins": 12.439630508422852, + "rewards/rejected": -13.373922348022461, + "step": 9360 + }, + { + "epoch": 0.56, + "learning_rate": 2.4237772478866403e-06, + "logits/chosen": -2.7401957511901855, + "logits/rejected": -2.2874555587768555, + "logps/chosen": -153.06741333007812, + "logps/rejected": -1248.850830078125, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.777852475643158, + "rewards/margins": 11.264423370361328, + "rewards/rejected": -12.042276382446289, + "step": 9370 + }, + { + "epoch": 0.56, + "learning_rate": 2.4185761113159677e-06, + "logits/chosen": -2.7049171924591064, + "logits/rejected": -2.3371849060058594, + "logps/chosen": -148.55088806152344, + "logps/rejected": -1172.7884521484375, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.829466700553894, + "rewards/margins": 10.430952072143555, + "rewards/rejected": -11.260418891906738, + "step": 9380 + }, + { + "epoch": 0.56, + "learning_rate": 2.4133753275223114e-06, + "logits/chosen": -2.713712692260742, + "logits/rejected": -2.361161708831787, + "logps/chosen": -174.9651641845703, + "logps/rejected": -1227.4547119140625, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0444750785827637, + "rewards/margins": 10.787628173828125, + "rewards/rejected": -11.832103729248047, + "step": 9390 + }, + { + "epoch": 0.56, + "learning_rate": 2.4081749190385818e-06, + "logits/chosen": -2.7402079105377197, + "logits/rejected": -2.446700096130371, + "logps/chosen": -144.0089111328125, + "logps/rejected": -1308.43115234375, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7618849873542786, + "rewards/margins": 11.874956130981445, + "rewards/rejected": -12.636838912963867, + "step": 9400 + }, + { + "epoch": 0.56, + "learning_rate": 2.402974908396059e-06, + "logits/chosen": -2.717909336090088, + "logits/rejected": -2.3127570152282715, + "logps/chosen": -156.04469299316406, + "logps/rejected": -1263.321533203125, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8087388873100281, + "rewards/margins": 11.377927780151367, + "rewards/rejected": -12.186666488647461, + "step": 9410 + }, + { + "epoch": 0.56, + "learning_rate": 2.397775318124302e-06, + "logits/chosen": -2.6970152854919434, + "logits/rejected": -2.3667407035827637, + "logps/chosen": -150.6900177001953, + "logps/rejected": -1348.1600341796875, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8574720621109009, + "rewards/margins": 12.17613697052002, + "rewards/rejected": -13.033609390258789, + "step": 9420 + }, + { + "epoch": 0.56, + "learning_rate": 2.3925761707510484e-06, + "logits/chosen": -2.7813735008239746, + "logits/rejected": -2.4709270000457764, + "logps/chosen": -153.06045532226562, + "logps/rejected": -1096.801025390625, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8275460004806519, + "rewards/margins": 9.691169738769531, + "rewards/rejected": -10.518715858459473, + "step": 9430 + }, + { + "epoch": 0.56, + "learning_rate": 2.387377488802116e-06, + "logits/chosen": -2.67199969291687, + "logits/rejected": -2.2829158306121826, + "logps/chosen": -188.1942138671875, + "logps/rejected": -1222.33203125, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.120816946029663, + "rewards/margins": 10.654254913330078, + "rewards/rejected": -11.77507209777832, + "step": 9440 + }, + { + "epoch": 0.56, + "learning_rate": 2.382179294801305e-06, + "logits/chosen": -2.743873357772827, + "logits/rejected": -2.3810336589813232, + "logps/chosen": -136.88839721679688, + "logps/rejected": -1203.7110595703125, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6752172112464905, + "rewards/margins": 10.918087005615234, + "rewards/rejected": -11.593304634094238, + "step": 9450 + }, + { + "epoch": 0.56, + "learning_rate": 2.376981611270305e-06, + "logits/chosen": -2.6915335655212402, + "logits/rejected": -2.3777434825897217, + "logps/chosen": -150.03700256347656, + "logps/rejected": -1328.369873046875, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7870742678642273, + "rewards/margins": 12.051506042480469, + "rewards/rejected": -12.838579177856445, + "step": 9460 + }, + { + "epoch": 0.56, + "learning_rate": 2.3717844607285905e-06, + "logits/chosen": -2.7596230506896973, + "logits/rejected": -2.3682141304016113, + "logps/chosen": -152.27685546875, + "logps/rejected": -1317.722900390625, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8494324684143066, + "rewards/margins": 11.87131404876709, + "rewards/rejected": -12.720746994018555, + "step": 9470 + }, + { + "epoch": 0.57, + "learning_rate": 2.3665878656933285e-06, + "logits/chosen": -2.7440428733825684, + "logits/rejected": -2.443173885345459, + "logps/chosen": -170.11737060546875, + "logps/rejected": -1196.157958984375, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.021935224533081, + "rewards/margins": 10.493707656860352, + "rewards/rejected": -11.515642166137695, + "step": 9480 + }, + { + "epoch": 0.57, + "learning_rate": 2.3613918486792777e-06, + "logits/chosen": -2.753974437713623, + "logits/rejected": -2.3040335178375244, + "logps/chosen": -167.3306884765625, + "logps/rejected": -1311.3167724609375, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.975999653339386, + "rewards/margins": 11.682435989379883, + "rewards/rejected": -12.65843391418457, + "step": 9490 + }, + { + "epoch": 0.57, + "learning_rate": 2.3561964321986963e-06, + "logits/chosen": -2.740182399749756, + "logits/rejected": -2.3473961353302, + "logps/chosen": -140.16009521484375, + "logps/rejected": -1294.11328125, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.655563473701477, + "rewards/margins": 11.846887588500977, + "rewards/rejected": -12.50245189666748, + "step": 9500 + }, + { + "epoch": 0.57, + "learning_rate": 2.351001638761236e-06, + "logits/chosen": -2.7467072010040283, + "logits/rejected": -2.3630900382995605, + "logps/chosen": -169.1416473388672, + "logps/rejected": -1334.5823974609375, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0012924671173096, + "rewards/margins": 11.913200378417969, + "rewards/rejected": -12.9144926071167, + "step": 9510 + }, + { + "epoch": 0.57, + "learning_rate": 2.34580749087385e-06, + "logits/chosen": -2.724733829498291, + "logits/rejected": -2.36202335357666, + "logps/chosen": -146.73130798339844, + "logps/rejected": -1197.087890625, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7576860785484314, + "rewards/margins": 10.775758743286133, + "rewards/rejected": -11.533442497253418, + "step": 9520 + }, + { + "epoch": 0.57, + "learning_rate": 2.3406140110406984e-06, + "logits/chosen": -2.717862367630005, + "logits/rejected": -2.3901379108428955, + "logps/chosen": -132.48936462402344, + "logps/rejected": -1347.44140625, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5981499552726746, + "rewards/margins": 12.440714836120605, + "rewards/rejected": -13.038862228393555, + "step": 9530 + }, + { + "epoch": 0.57, + "learning_rate": 2.3354212217630428e-06, + "logits/chosen": -2.768568515777588, + "logits/rejected": -2.32222318649292, + "logps/chosen": -167.62844848632812, + "logps/rejected": -1297.629638671875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0196685791015625, + "rewards/margins": 11.51185417175293, + "rewards/rejected": -12.531522750854492, + "step": 9540 + }, + { + "epoch": 0.57, + "learning_rate": 2.3302291455391525e-06, + "logits/chosen": -2.7186317443847656, + "logits/rejected": -2.332707166671753, + "logps/chosen": -173.63253784179688, + "logps/rejected": -1247.8470458984375, + "loss": 0.0232, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9834288358688354, + "rewards/margins": 11.050291061401367, + "rewards/rejected": -12.033719062805176, + "step": 9550 + }, + { + "epoch": 0.57, + "learning_rate": 2.3250378048642117e-06, + "logits/chosen": -2.722168207168579, + "logits/rejected": -2.329042911529541, + "logps/chosen": -136.57289123535156, + "logps/rejected": -1243.9241943359375, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6665438413619995, + "rewards/margins": 11.333455085754395, + "rewards/rejected": -11.999998092651367, + "step": 9560 + }, + { + "epoch": 0.57, + "learning_rate": 2.3198472222302144e-06, + "logits/chosen": -2.715280294418335, + "logits/rejected": -2.345092296600342, + "logps/chosen": -142.8782196044922, + "logps/rejected": -1219.1258544921875, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6935387849807739, + "rewards/margins": 11.052824974060059, + "rewards/rejected": -11.746365547180176, + "step": 9570 + }, + { + "epoch": 0.57, + "learning_rate": 2.3146574201258697e-06, + "logits/chosen": -2.7101492881774902, + "logits/rejected": -2.253700017929077, + "logps/chosen": -123.58675384521484, + "logps/rejected": -1261.567626953125, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4841700494289398, + "rewards/margins": 11.675737380981445, + "rewards/rejected": -12.159907341003418, + "step": 9580 + }, + { + "epoch": 0.57, + "learning_rate": 2.309468421036509e-06, + "logits/chosen": -2.712399959564209, + "logits/rejected": -2.3561177253723145, + "logps/chosen": -137.35348510742188, + "logps/rejected": -1089.5350341796875, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7036619782447815, + "rewards/margins": 9.754589080810547, + "rewards/rejected": -10.458251953125, + "step": 9590 + }, + { + "epoch": 0.57, + "learning_rate": 2.3042802474439805e-06, + "logits/chosen": -2.71528959274292, + "logits/rejected": -2.3430187702178955, + "logps/chosen": -147.6951446533203, + "logps/rejected": -1242.837158203125, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7670568227767944, + "rewards/margins": 11.235685348510742, + "rewards/rejected": -12.002740859985352, + "step": 9600 + }, + { + "epoch": 0.57, + "learning_rate": 2.299092921826556e-06, + "logits/chosen": -2.7347283363342285, + "logits/rejected": -2.389584541320801, + "logps/chosen": -143.44552612304688, + "logps/rejected": -1186.394775390625, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7385128736495972, + "rewards/margins": 10.69105052947998, + "rewards/rejected": -11.429563522338867, + "step": 9610 + }, + { + "epoch": 0.57, + "learning_rate": 2.293906466658837e-06, + "logits/chosen": -2.719064235687256, + "logits/rejected": -2.3804092407226562, + "logps/chosen": -141.65573120117188, + "logps/rejected": -1206.084716796875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6805785894393921, + "rewards/margins": 10.930667877197266, + "rewards/rejected": -11.611247062683105, + "step": 9620 + }, + { + "epoch": 0.57, + "learning_rate": 2.288720904411651e-06, + "logits/chosen": -2.7025928497314453, + "logits/rejected": -2.397294282913208, + "logps/chosen": -156.29571533203125, + "logps/rejected": -1319.673583984375, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8888632655143738, + "rewards/margins": 11.861442565917969, + "rewards/rejected": -12.750304222106934, + "step": 9630 + }, + { + "epoch": 0.57, + "learning_rate": 2.283536257551955e-06, + "logits/chosen": -2.6957173347473145, + "logits/rejected": -2.3164567947387695, + "logps/chosen": -174.2127227783203, + "logps/rejected": -1319.6488037109375, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0500398874282837, + "rewards/margins": 11.702432632446289, + "rewards/rejected": -12.752473831176758, + "step": 9640 + }, + { + "epoch": 0.58, + "learning_rate": 2.278352548542744e-06, + "logits/chosen": -2.6827242374420166, + "logits/rejected": -2.344632625579834, + "logps/chosen": -176.06137084960938, + "logps/rejected": -1327.793212890625, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0405280590057373, + "rewards/margins": 11.787446975708008, + "rewards/rejected": -12.827974319458008, + "step": 9650 + }, + { + "epoch": 0.58, + "learning_rate": 2.2731697998429485e-06, + "logits/chosen": -2.7394208908081055, + "logits/rejected": -2.39331316947937, + "logps/chosen": -162.4534454345703, + "logps/rejected": -1293.4583740234375, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9437228441238403, + "rewards/margins": 11.549233436584473, + "rewards/rejected": -12.492956161499023, + "step": 9660 + }, + { + "epoch": 0.58, + "learning_rate": 2.267988033907335e-06, + "logits/chosen": -2.7272305488586426, + "logits/rejected": -2.384800672531128, + "logps/chosen": -150.8538360595703, + "logps/rejected": -1176.474365234375, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7473594546318054, + "rewards/margins": 10.571760177612305, + "rewards/rejected": -11.319120407104492, + "step": 9670 + }, + { + "epoch": 0.58, + "learning_rate": 2.2628072731864186e-06, + "logits/chosen": -2.6871254444122314, + "logits/rejected": -2.3054261207580566, + "logps/chosen": -139.70700073242188, + "logps/rejected": -1352.862060546875, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6979607343673706, + "rewards/margins": 12.369781494140625, + "rewards/rejected": -13.067741394042969, + "step": 9680 + }, + { + "epoch": 0.58, + "learning_rate": 2.257627540126353e-06, + "logits/chosen": -2.7221837043762207, + "logits/rejected": -2.3884360790252686, + "logps/chosen": -177.17596435546875, + "logps/rejected": -1203.695556640625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0383026599884033, + "rewards/margins": 10.544049263000488, + "rewards/rejected": -11.582351684570312, + "step": 9690 + }, + { + "epoch": 0.58, + "learning_rate": 2.2524488571688407e-06, + "logits/chosen": -2.740537643432617, + "logits/rejected": -2.3334858417510986, + "logps/chosen": -147.5407257080078, + "logps/rejected": -1223.970458984375, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7409318089485168, + "rewards/margins": 11.06391429901123, + "rewards/rejected": -11.804845809936523, + "step": 9700 + }, + { + "epoch": 0.58, + "learning_rate": 2.247271246751039e-06, + "logits/chosen": -2.6774489879608154, + "logits/rejected": -2.2068538665771484, + "logps/chosen": -143.45022583007812, + "logps/rejected": -1324.55078125, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7117710113525391, + "rewards/margins": 12.084896087646484, + "rewards/rejected": -12.796667098999023, + "step": 9710 + }, + { + "epoch": 0.58, + "learning_rate": 2.242094731305452e-06, + "logits/chosen": -2.7038002014160156, + "logits/rejected": -2.322172164916992, + "logps/chosen": -149.22738647460938, + "logps/rejected": -1293.8321533203125, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8422451019287109, + "rewards/margins": 11.669540405273438, + "rewards/rejected": -12.511785507202148, + "step": 9720 + }, + { + "epoch": 0.58, + "learning_rate": 2.236919333259844e-06, + "logits/chosen": -2.710664987564087, + "logits/rejected": -2.3422932624816895, + "logps/chosen": -174.58734130859375, + "logps/rejected": -1267.180419921875, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0178511142730713, + "rewards/margins": 11.215797424316406, + "rewards/rejected": -12.233648300170898, + "step": 9730 + }, + { + "epoch": 0.58, + "learning_rate": 2.231745075037137e-06, + "logits/chosen": -2.7135777473449707, + "logits/rejected": -2.3462371826171875, + "logps/chosen": -158.44654846191406, + "logps/rejected": -1228.8057861328125, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8770634531974792, + "rewards/margins": 10.965360641479492, + "rewards/rejected": -11.842424392700195, + "step": 9740 + }, + { + "epoch": 0.58, + "learning_rate": 2.2265719790553147e-06, + "logits/chosen": -2.71870756149292, + "logits/rejected": -2.393075466156006, + "logps/chosen": -142.76229858398438, + "logps/rejected": -1248.034912109375, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7533028721809387, + "rewards/margins": 11.273712158203125, + "rewards/rejected": -12.027015686035156, + "step": 9750 + }, + { + "epoch": 0.58, + "learning_rate": 2.221400067727323e-06, + "logits/chosen": -2.7411274909973145, + "logits/rejected": -2.378117799758911, + "logps/chosen": -148.7209014892578, + "logps/rejected": -1237.346923828125, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7532230615615845, + "rewards/margins": 11.163887977600098, + "rewards/rejected": -11.917110443115234, + "step": 9760 + }, + { + "epoch": 0.58, + "learning_rate": 2.21622936346098e-06, + "logits/chosen": -2.7367300987243652, + "logits/rejected": -2.3767924308776855, + "logps/chosen": -133.9344482421875, + "logps/rejected": -1165.0394287109375, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6389613151550293, + "rewards/margins": 10.568251609802246, + "rewards/rejected": -11.207212448120117, + "step": 9770 + }, + { + "epoch": 0.58, + "learning_rate": 2.2110598886588693e-06, + "logits/chosen": -2.705064296722412, + "logits/rejected": -2.3200974464416504, + "logps/chosen": -123.133056640625, + "logps/rejected": -1162.938720703125, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.488416850566864, + "rewards/margins": 10.700271606445312, + "rewards/rejected": -11.188688278198242, + "step": 9780 + }, + { + "epoch": 0.58, + "learning_rate": 2.2058916657182493e-06, + "logits/chosen": -2.7131052017211914, + "logits/rejected": -2.3258042335510254, + "logps/chosen": -119.3879165649414, + "logps/rejected": -1254.6146240234375, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5529167056083679, + "rewards/margins": 11.558649063110352, + "rewards/rejected": -12.111566543579102, + "step": 9790 + }, + { + "epoch": 0.58, + "learning_rate": 2.2007247170309567e-06, + "logits/chosen": -2.676908016204834, + "logits/rejected": -2.319890260696411, + "logps/chosen": -120.99007415771484, + "logps/rejected": -1231.625244140625, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5064847469329834, + "rewards/margins": 11.367959022521973, + "rewards/rejected": -11.874443054199219, + "step": 9800 + }, + { + "epoch": 0.58, + "learning_rate": 2.195559064983304e-06, + "logits/chosen": -2.7641353607177734, + "logits/rejected": -2.2611196041107178, + "logps/chosen": -107.12109375, + "logps/rejected": -1164.3746337890625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40743571519851685, + "rewards/margins": 10.78770637512207, + "rewards/rejected": -11.195141792297363, + "step": 9810 + }, + { + "epoch": 0.59, + "learning_rate": 2.1903947319559884e-06, + "logits/chosen": -2.672227382659912, + "logits/rejected": -2.273566484451294, + "logps/chosen": -114.71855163574219, + "logps/rejected": -1231.4246826171875, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39840349555015564, + "rewards/margins": 11.47371768951416, + "rewards/rejected": -11.872122764587402, + "step": 9820 + }, + { + "epoch": 0.59, + "learning_rate": 2.1852317403239907e-06, + "logits/chosen": -2.728119134902954, + "logits/rejected": -2.2599117755889893, + "logps/chosen": -103.9937973022461, + "logps/rejected": -1233.3365478515625, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34447240829467773, + "rewards/margins": 11.541015625, + "rewards/rejected": -11.885488510131836, + "step": 9830 + }, + { + "epoch": 0.59, + "learning_rate": 2.180070112456482e-06, + "logits/chosen": -2.6759068965911865, + "logits/rejected": -2.2568295001983643, + "logps/chosen": -117.73927307128906, + "logps/rejected": -1246.0224609375, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4468896985054016, + "rewards/margins": 11.570645332336426, + "rewards/rejected": -12.017535209655762, + "step": 9840 + }, + { + "epoch": 0.59, + "learning_rate": 2.174909870716721e-06, + "logits/chosen": -2.692049741744995, + "logits/rejected": -2.3022232055664062, + "logps/chosen": -99.54176330566406, + "logps/rejected": -1176.357421875, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2951687276363373, + "rewards/margins": 11.026335716247559, + "rewards/rejected": -11.321505546569824, + "step": 9850 + }, + { + "epoch": 0.59, + "learning_rate": 2.169751037461966e-06, + "logits/chosen": -2.727674961090088, + "logits/rejected": -2.2472915649414062, + "logps/chosen": -102.3802490234375, + "logps/rejected": -1200.757568359375, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2733840048313141, + "rewards/margins": 11.29835033416748, + "rewards/rejected": -11.571733474731445, + "step": 9860 + }, + { + "epoch": 0.59, + "learning_rate": 2.1645936350433692e-06, + "logits/chosen": -2.675719738006592, + "logits/rejected": -2.3521580696105957, + "logps/chosen": -122.25919342041016, + "logps/rejected": -1234.517578125, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5146879553794861, + "rewards/margins": 11.389273643493652, + "rewards/rejected": -11.903961181640625, + "step": 9870 + }, + { + "epoch": 0.59, + "learning_rate": 2.159437685805883e-06, + "logits/chosen": -2.710116386413574, + "logits/rejected": -2.3138279914855957, + "logps/chosen": -111.29341125488281, + "logps/rejected": -1215.350341796875, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3928848206996918, + "rewards/margins": 11.326349258422852, + "rewards/rejected": -11.71923542022705, + "step": 9880 + }, + { + "epoch": 0.59, + "learning_rate": 2.154283212088168e-06, + "logits/chosen": -2.712231397628784, + "logits/rejected": -2.2694108486175537, + "logps/chosen": -112.117431640625, + "logps/rejected": -1220.748046875, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4200320243835449, + "rewards/margins": 11.336292266845703, + "rewards/rejected": -11.756322860717773, + "step": 9890 + }, + { + "epoch": 0.59, + "learning_rate": 2.149130236222487e-06, + "logits/chosen": -2.702179431915283, + "logits/rejected": -2.3258004188537598, + "logps/chosen": -109.50498962402344, + "logps/rejected": -1042.2923583984375, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3576398491859436, + "rewards/margins": 9.622787475585938, + "rewards/rejected": -9.980427742004395, + "step": 9900 + }, + { + "epoch": 0.59, + "learning_rate": 2.143978780534616e-06, + "logits/chosen": -2.697065830230713, + "logits/rejected": -2.3736064434051514, + "logps/chosen": -118.19590759277344, + "logps/rejected": -1157.2821044921875, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4652016758918762, + "rewards/margins": 10.67738151550293, + "rewards/rejected": -11.142583847045898, + "step": 9910 + }, + { + "epoch": 0.59, + "learning_rate": 2.138828867343746e-06, + "logits/chosen": -2.704310417175293, + "logits/rejected": -2.254875421524048, + "logps/chosen": -121.0136489868164, + "logps/rejected": -1098.206787109375, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4516844153404236, + "rewards/margins": 10.086403846740723, + "rewards/rejected": -10.538087844848633, + "step": 9920 + }, + { + "epoch": 0.59, + "learning_rate": 2.1336805189623813e-06, + "logits/chosen": -2.703275680541992, + "logits/rejected": -2.268481731414795, + "logps/chosen": -132.83358764648438, + "logps/rejected": -1127.052490234375, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5715658664703369, + "rewards/margins": 10.252817153930664, + "rewards/rejected": -10.824382781982422, + "step": 9930 + }, + { + "epoch": 0.59, + "learning_rate": 2.128533757696248e-06, + "logits/chosen": -2.7110695838928223, + "logits/rejected": -2.2251057624816895, + "logps/chosen": -132.5632781982422, + "logps/rejected": -1197.8358154296875, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6174434423446655, + "rewards/margins": 10.920549392700195, + "rewards/rejected": -11.537992477416992, + "step": 9940 + }, + { + "epoch": 0.59, + "learning_rate": 2.123388605844198e-06, + "logits/chosen": -2.7473742961883545, + "logits/rejected": -2.340789318084717, + "logps/chosen": -113.72737121582031, + "logps/rejected": -1233.451416015625, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4353352189064026, + "rewards/margins": 11.462714195251465, + "rewards/rejected": -11.898050308227539, + "step": 9950 + }, + { + "epoch": 0.59, + "learning_rate": 2.1182450856981066e-06, + "logits/chosen": -2.691542148590088, + "logits/rejected": -2.3198623657226562, + "logps/chosen": -134.77635192871094, + "logps/rejected": -1283.7322998046875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6292972564697266, + "rewards/margins": 11.758000373840332, + "rewards/rejected": -12.387296676635742, + "step": 9960 + }, + { + "epoch": 0.59, + "learning_rate": 2.113103219542782e-06, + "logits/chosen": -2.729349136352539, + "logits/rejected": -2.3680644035339355, + "logps/chosen": -128.93963623046875, + "logps/rejected": -1161.668701171875, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5358244180679321, + "rewards/margins": 10.645295143127441, + "rewards/rejected": -11.181119918823242, + "step": 9970 + }, + { + "epoch": 0.6, + "learning_rate": 2.107963029655867e-06, + "logits/chosen": -2.654165506362915, + "logits/rejected": -2.2560324668884277, + "logps/chosen": -126.63565826416016, + "logps/rejected": -1294.947509765625, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5407634973526001, + "rewards/margins": 11.96657657623291, + "rewards/rejected": -12.507340431213379, + "step": 9980 + }, + { + "epoch": 0.6, + "learning_rate": 2.1028245383077392e-06, + "logits/chosen": -2.671581983566284, + "logits/rejected": -2.2404465675354004, + "logps/chosen": -130.98965454101562, + "logps/rejected": -1157.130126953125, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6222785711288452, + "rewards/margins": 10.513528823852539, + "rewards/rejected": -11.135807991027832, + "step": 9990 + }, + { + "epoch": 0.6, + "learning_rate": 2.0976877677614183e-06, + "logits/chosen": -2.751434803009033, + "logits/rejected": -2.2885284423828125, + "logps/chosen": -141.47140502929688, + "logps/rejected": -1288.7344970703125, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6786977052688599, + "rewards/margins": 11.744922637939453, + "rewards/rejected": -12.423620223999023, + "step": 10000 + }, + { + "epoch": 0.6, + "learning_rate": 2.09255274027247e-06, + "logits/chosen": -2.715000629425049, + "logits/rejected": -2.3316192626953125, + "logps/chosen": -143.27122497558594, + "logps/rejected": -1246.9996337890625, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6987273097038269, + "rewards/margins": 11.320338249206543, + "rewards/rejected": -12.019065856933594, + "step": 10010 + }, + { + "epoch": 0.6, + "learning_rate": 2.087419478088906e-06, + "logits/chosen": -2.6851000785827637, + "logits/rejected": -2.3289437294006348, + "logps/chosen": -131.16859436035156, + "logps/rejected": -1238.6689453125, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6639537811279297, + "rewards/margins": 11.288864135742188, + "rewards/rejected": -11.9528169631958, + "step": 10020 + }, + { + "epoch": 0.6, + "learning_rate": 2.0822880034510897e-06, + "logits/chosen": -2.6702263355255127, + "logits/rejected": -2.3204562664031982, + "logps/chosen": -151.24459838867188, + "logps/rejected": -1240.671142578125, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7986651659011841, + "rewards/margins": 11.178454399108887, + "rewards/rejected": -11.977119445800781, + "step": 10030 + }, + { + "epoch": 0.6, + "learning_rate": 2.077158338591641e-06, + "logits/chosen": -2.722813367843628, + "logits/rejected": -2.3071341514587402, + "logps/chosen": -153.7009735107422, + "logps/rejected": -1224.3828125, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8373913764953613, + "rewards/margins": 10.950312614440918, + "rewards/rejected": -11.787703514099121, + "step": 10040 + }, + { + "epoch": 0.6, + "learning_rate": 2.0720305057353384e-06, + "logits/chosen": -2.7021384239196777, + "logits/rejected": -2.377882242202759, + "logps/chosen": -166.43902587890625, + "logps/rejected": -1259.81396484375, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8993788957595825, + "rewards/margins": 11.235876083374023, + "rewards/rejected": -12.135255813598633, + "step": 10050 + }, + { + "epoch": 0.6, + "learning_rate": 2.0669045270990216e-06, + "logits/chosen": -2.687422752380371, + "logits/rejected": -2.267098903656006, + "logps/chosen": -169.32257080078125, + "logps/rejected": -1229.2327880859375, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0353240966796875, + "rewards/margins": 10.812653541564941, + "rewards/rejected": -11.847977638244629, + "step": 10060 + }, + { + "epoch": 0.6, + "learning_rate": 2.0617804248914992e-06, + "logits/chosen": -2.7357964515686035, + "logits/rejected": -2.3649051189422607, + "logps/chosen": -139.17788696289062, + "logps/rejected": -1189.037353515625, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7059356570243835, + "rewards/margins": 10.748687744140625, + "rewards/rejected": -11.454623222351074, + "step": 10070 + }, + { + "epoch": 0.6, + "learning_rate": 2.056658221313449e-06, + "logits/chosen": -2.6646952629089355, + "logits/rejected": -2.3168365955352783, + "logps/chosen": -150.77810668945312, + "logps/rejected": -1313.236083984375, + "loss": 0.0234, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7881194353103638, + "rewards/margins": 11.894584655761719, + "rewards/rejected": -12.68270492553711, + "step": 10080 + }, + { + "epoch": 0.6, + "learning_rate": 2.0515379385573205e-06, + "logits/chosen": -2.690781831741333, + "logits/rejected": -2.2457027435302734, + "logps/chosen": -160.4130401611328, + "logps/rejected": -1176.741455078125, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8455179333686829, + "rewards/margins": 10.480415344238281, + "rewards/rejected": -11.325933456420898, + "step": 10090 + }, + { + "epoch": 0.6, + "learning_rate": 2.0464195988072454e-06, + "logits/chosen": -2.6529228687286377, + "logits/rejected": -2.297720432281494, + "logps/chosen": -173.38729858398438, + "logps/rejected": -1276.4019775390625, + "loss": 0.0351, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.0520063638687134, + "rewards/margins": 11.274874687194824, + "rewards/rejected": -12.326879501342773, + "step": 10100 + }, + { + "epoch": 0.6, + "learning_rate": 2.041303224238934e-06, + "logits/chosen": -2.6802988052368164, + "logits/rejected": -2.221172571182251, + "logps/chosen": -143.85702514648438, + "logps/rejected": -1220.127685546875, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7228442430496216, + "rewards/margins": 11.044290542602539, + "rewards/rejected": -11.767134666442871, + "step": 10110 + }, + { + "epoch": 0.6, + "learning_rate": 2.036188837019582e-06, + "logits/chosen": -2.6976428031921387, + "logits/rejected": -2.2647860050201416, + "logps/chosen": -148.67123413085938, + "logps/rejected": -1270.944580078125, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.786220908164978, + "rewards/margins": 11.47563648223877, + "rewards/rejected": -12.261857032775879, + "step": 10120 + }, + { + "epoch": 0.6, + "learning_rate": 2.031076459307777e-06, + "logits/chosen": -2.704453468322754, + "logits/rejected": -2.2222750186920166, + "logps/chosen": -158.06971740722656, + "logps/rejected": -1266.3482666015625, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8916547894477844, + "rewards/margins": 11.33264446258545, + "rewards/rejected": -12.224299430847168, + "step": 10130 + }, + { + "epoch": 0.6, + "learning_rate": 2.0259661132533983e-06, + "logits/chosen": -2.7022762298583984, + "logits/rejected": -2.2719624042510986, + "logps/chosen": -170.3082733154297, + "logps/rejected": -1208.604736328125, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9979127645492554, + "rewards/margins": 10.648189544677734, + "rewards/rejected": -11.646102905273438, + "step": 10140 + }, + { + "epoch": 0.61, + "learning_rate": 2.020857820997524e-06, + "logits/chosen": -2.7469606399536133, + "logits/rejected": -2.2794220447540283, + "logps/chosen": -156.23287963867188, + "logps/rejected": -1146.036376953125, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8729828596115112, + "rewards/margins": 10.147176742553711, + "rewards/rejected": -11.020161628723145, + "step": 10150 + }, + { + "epoch": 0.61, + "learning_rate": 2.015751604672333e-06, + "logits/chosen": -2.650033712387085, + "logits/rejected": -2.2324442863464355, + "logps/chosen": -175.41061401367188, + "logps/rejected": -1169.69482421875, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.044511079788208, + "rewards/margins": 10.20124340057373, + "rewards/rejected": -11.245756149291992, + "step": 10160 + }, + { + "epoch": 0.61, + "learning_rate": 2.010647486401011e-06, + "logits/chosen": -2.741267681121826, + "logits/rejected": -2.277954578399658, + "logps/chosen": -169.172119140625, + "logps/rejected": -1255.72998046875, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9312788248062134, + "rewards/margins": 11.190518379211426, + "rewards/rejected": -12.121795654296875, + "step": 10170 + }, + { + "epoch": 0.61, + "learning_rate": 2.005545488297652e-06, + "logits/chosen": -2.6606040000915527, + "logits/rejected": -2.2214267253875732, + "logps/chosen": -159.6976776123047, + "logps/rejected": -1201.581298828125, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.900174617767334, + "rewards/margins": 10.674308776855469, + "rewards/rejected": -11.574483871459961, + "step": 10180 + }, + { + "epoch": 0.61, + "learning_rate": 2.0004456324671673e-06, + "logits/chosen": -2.6660828590393066, + "logits/rejected": -2.1683387756347656, + "logps/chosen": -176.6342010498047, + "logps/rejected": -1268.905517578125, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0480890274047852, + "rewards/margins": 11.187259674072266, + "rewards/rejected": -12.235349655151367, + "step": 10190 + }, + { + "epoch": 0.61, + "learning_rate": 1.9953479410051833e-06, + "logits/chosen": -2.725188732147217, + "logits/rejected": -2.3086845874786377, + "logps/chosen": -190.69775390625, + "logps/rejected": -1240.564453125, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1602516174316406, + "rewards/margins": 10.803372383117676, + "rewards/rejected": -11.963623046875, + "step": 10200 + }, + { + "epoch": 0.61, + "learning_rate": 1.9902524359979494e-06, + "logits/chosen": -2.7197043895721436, + "logits/rejected": -2.2445409297943115, + "logps/chosen": -166.06114196777344, + "logps/rejected": -1292.9495849609375, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9960309863090515, + "rewards/margins": 11.476550102233887, + "rewards/rejected": -12.472580909729004, + "step": 10210 + }, + { + "epoch": 0.61, + "learning_rate": 1.985159139522245e-06, + "logits/chosen": -2.6791765689849854, + "logits/rejected": -2.278733491897583, + "logps/chosen": -186.21365356445312, + "logps/rejected": -1187.319091796875, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1369743347167969, + "rewards/margins": 10.308209419250488, + "rewards/rejected": -11.445182800292969, + "step": 10220 + }, + { + "epoch": 0.61, + "learning_rate": 1.9800680736452773e-06, + "logits/chosen": -2.7204699516296387, + "logits/rejected": -2.27595853805542, + "logps/chosen": -171.42391967773438, + "logps/rejected": -1147.065185546875, + "loss": 0.0136, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.0061955451965332, + "rewards/margins": 10.023384094238281, + "rewards/rejected": -11.029580116271973, + "step": 10230 + }, + { + "epoch": 0.61, + "learning_rate": 1.974979260424591e-06, + "logits/chosen": -2.7123920917510986, + "logits/rejected": -2.3224048614501953, + "logps/chosen": -150.8501434326172, + "logps/rejected": -1207.732177734375, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7454864978790283, + "rewards/margins": 10.883295059204102, + "rewards/rejected": -11.62878131866455, + "step": 10240 + }, + { + "epoch": 0.61, + "learning_rate": 1.969892721907971e-06, + "logits/chosen": -2.691455364227295, + "logits/rejected": -2.288158655166626, + "logps/chosen": -176.45851135253906, + "logps/rejected": -1143.230224609375, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0605857372283936, + "rewards/margins": 9.927177429199219, + "rewards/rejected": -10.987763404846191, + "step": 10250 + }, + { + "epoch": 0.61, + "learning_rate": 1.9648084801333468e-06, + "logits/chosen": -2.6569085121154785, + "logits/rejected": -2.267460584640503, + "logps/chosen": -157.91433715820312, + "logps/rejected": -1155.6077880859375, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9086402058601379, + "rewards/margins": 10.202337265014648, + "rewards/rejected": -11.110978126525879, + "step": 10260 + }, + { + "epoch": 0.61, + "learning_rate": 1.9597265571286945e-06, + "logits/chosen": -2.6799840927124023, + "logits/rejected": -2.2462401390075684, + "logps/chosen": -147.64344787597656, + "logps/rejected": -1218.9774169921875, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8373859524726868, + "rewards/margins": 10.907033920288086, + "rewards/rejected": -11.74441909790039, + "step": 10270 + }, + { + "epoch": 0.61, + "learning_rate": 1.9546469749119485e-06, + "logits/chosen": -2.729508876800537, + "logits/rejected": -2.3001186847686768, + "logps/chosen": -151.4470977783203, + "logps/rejected": -1142.6334228515625, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7849904298782349, + "rewards/margins": 10.195791244506836, + "rewards/rejected": -10.980780601501465, + "step": 10280 + }, + { + "epoch": 0.61, + "learning_rate": 1.9495697554908984e-06, + "logits/chosen": -2.678361415863037, + "logits/rejected": -2.2465968132019043, + "logps/chosen": -144.54483032226562, + "logps/rejected": -1181.9140625, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7672642469406128, + "rewards/margins": 10.611356735229492, + "rewards/rejected": -11.378621101379395, + "step": 10290 + }, + { + "epoch": 0.61, + "learning_rate": 1.944494920863096e-06, + "logits/chosen": -2.686680555343628, + "logits/rejected": -2.2656469345092773, + "logps/chosen": -169.4293212890625, + "logps/rejected": -1171.400634765625, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9755252003669739, + "rewards/margins": 10.304737091064453, + "rewards/rejected": -11.280261993408203, + "step": 10300 + }, + { + "epoch": 0.61, + "learning_rate": 1.939422493015764e-06, + "logits/chosen": -2.6721138954162598, + "logits/rejected": -2.21661639213562, + "logps/chosen": -155.87158203125, + "logps/rejected": -1284.742919921875, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8563534617424011, + "rewards/margins": 11.553642272949219, + "rewards/rejected": -12.409995079040527, + "step": 10310 + }, + { + "epoch": 0.62, + "learning_rate": 1.934352493925695e-06, + "logits/chosen": -2.728530168533325, + "logits/rejected": -2.3068325519561768, + "logps/chosen": -174.71905517578125, + "logps/rejected": -1075.154541015625, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0647732019424438, + "rewards/margins": 9.243081092834473, + "rewards/rejected": -10.307853698730469, + "step": 10320 + }, + { + "epoch": 0.62, + "learning_rate": 1.929284945559159e-06, + "logits/chosen": -2.7263424396514893, + "logits/rejected": -2.3152506351470947, + "logps/chosen": -187.24583435058594, + "logps/rejected": -1229.1773681640625, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0893981456756592, + "rewards/margins": 10.766997337341309, + "rewards/rejected": -11.85639476776123, + "step": 10330 + }, + { + "epoch": 0.62, + "learning_rate": 1.9242198698718096e-06, + "logits/chosen": -2.7002689838409424, + "logits/rejected": -2.285580635070801, + "logps/chosen": -169.35635375976562, + "logps/rejected": -1162.4388427734375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9782946705818176, + "rewards/margins": 10.206392288208008, + "rewards/rejected": -11.184687614440918, + "step": 10340 + }, + { + "epoch": 0.62, + "learning_rate": 1.919157288808585e-06, + "logits/chosen": -2.672635555267334, + "logits/rejected": -2.163297653198242, + "logps/chosen": -173.37655639648438, + "logps/rejected": -1241.8756103515625, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.027738332748413, + "rewards/margins": 10.940069198608398, + "rewards/rejected": -11.967806816101074, + "step": 10350 + }, + { + "epoch": 0.62, + "learning_rate": 1.914097224303616e-06, + "logits/chosen": -2.651663303375244, + "logits/rejected": -2.2349636554718018, + "logps/chosen": -167.01730346679688, + "logps/rejected": -1237.832763671875, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9664157629013062, + "rewards/margins": 10.976346969604492, + "rewards/rejected": -11.942761421203613, + "step": 10360 + }, + { + "epoch": 0.62, + "learning_rate": 1.9090396982801317e-06, + "logits/chosen": -2.69647479057312, + "logits/rejected": -2.3212196826934814, + "logps/chosen": -156.8590087890625, + "logps/rejected": -1209.521728515625, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9029251337051392, + "rewards/margins": 10.743374824523926, + "rewards/rejected": -11.646299362182617, + "step": 10370 + }, + { + "epoch": 0.62, + "learning_rate": 1.9039847326503608e-06, + "logits/chosen": -2.709139585494995, + "logits/rejected": -2.2755322456359863, + "logps/chosen": -181.7281036376953, + "logps/rejected": -1297.624755859375, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1287683248519897, + "rewards/margins": 11.405282974243164, + "rewards/rejected": -12.534050941467285, + "step": 10380 + }, + { + "epoch": 0.62, + "learning_rate": 1.8989323493154402e-06, + "logits/chosen": -2.7091329097747803, + "logits/rejected": -2.2651634216308594, + "logps/chosen": -162.07962036132812, + "logps/rejected": -1172.4117431640625, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9179351925849915, + "rewards/margins": 10.364705085754395, + "rewards/rejected": -11.282638549804688, + "step": 10390 + }, + { + "epoch": 0.62, + "learning_rate": 1.893882570165318e-06, + "logits/chosen": -2.674309253692627, + "logits/rejected": -2.3083274364471436, + "logps/chosen": -155.83653259277344, + "logps/rejected": -1306.9290771484375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8740228414535522, + "rewards/margins": 11.746663093566895, + "rewards/rejected": -12.620685577392578, + "step": 10400 + }, + { + "epoch": 0.62, + "learning_rate": 1.8888354170786604e-06, + "logits/chosen": -2.691688299179077, + "logits/rejected": -2.301456928253174, + "logps/chosen": -143.4541473388672, + "logps/rejected": -1127.671630859375, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7627164125442505, + "rewards/margins": 10.070082664489746, + "rewards/rejected": -10.83279800415039, + "step": 10410 + }, + { + "epoch": 0.62, + "learning_rate": 1.8837909119227541e-06, + "logits/chosen": -2.655333995819092, + "logits/rejected": -2.339489459991455, + "logps/chosen": -167.1618194580078, + "logps/rejected": -1247.521484375, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9810325503349304, + "rewards/margins": 11.044880867004395, + "rewards/rejected": -12.025912284851074, + "step": 10420 + }, + { + "epoch": 0.62, + "learning_rate": 1.878749076553416e-06, + "logits/chosen": -2.6650567054748535, + "logits/rejected": -2.192473888397217, + "logps/chosen": -159.62437438964844, + "logps/rejected": -1261.5546875, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8769809603691101, + "rewards/margins": 11.283531188964844, + "rewards/rejected": -12.16051197052002, + "step": 10430 + }, + { + "epoch": 0.62, + "learning_rate": 1.873709932814894e-06, + "logits/chosen": -2.648094892501831, + "logits/rejected": -2.242197036743164, + "logps/chosen": -149.56546020507812, + "logps/rejected": -1178.7962646484375, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7502764463424683, + "rewards/margins": 10.599845886230469, + "rewards/rejected": -11.350122451782227, + "step": 10440 + }, + { + "epoch": 0.62, + "learning_rate": 1.8686735025397728e-06, + "logits/chosen": -2.6840195655822754, + "logits/rejected": -2.3303234577178955, + "logps/chosen": -149.92015075683594, + "logps/rejected": -1316.4888916015625, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7355614304542542, + "rewards/margins": 11.985756874084473, + "rewards/rejected": -12.721318244934082, + "step": 10450 + }, + { + "epoch": 0.62, + "learning_rate": 1.8636398075488857e-06, + "logits/chosen": -2.6736550331115723, + "logits/rejected": -2.2199785709381104, + "logps/chosen": -147.96337890625, + "logps/rejected": -1128.46630859375, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7754791975021362, + "rewards/margins": 10.067721366882324, + "rewards/rejected": -10.84320068359375, + "step": 10460 + }, + { + "epoch": 0.62, + "learning_rate": 1.8586088696512101e-06, + "logits/chosen": -2.6745097637176514, + "logits/rejected": -2.230201005935669, + "logps/chosen": -140.08935546875, + "logps/rejected": -1106.438232421875, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6857896447181702, + "rewards/margins": 9.941146850585938, + "rewards/rejected": -10.626935958862305, + "step": 10470 + }, + { + "epoch": 0.62, + "learning_rate": 1.85358071064378e-06, + "logits/chosen": -2.7406771183013916, + "logits/rejected": -2.3733060359954834, + "logps/chosen": -137.88369750976562, + "logps/rejected": -1111.315673828125, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6779087781906128, + "rewards/margins": 10.002992630004883, + "rewards/rejected": -10.680900573730469, + "step": 10480 + }, + { + "epoch": 0.63, + "learning_rate": 1.8485553523115902e-06, + "logits/chosen": -2.6721410751342773, + "logits/rejected": -2.337549924850464, + "logps/chosen": -133.80642700195312, + "logps/rejected": -1201.2855224609375, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.638275682926178, + "rewards/margins": 10.92553997039795, + "rewards/rejected": -11.56381607055664, + "step": 10490 + }, + { + "epoch": 0.63, + "learning_rate": 1.8435328164275007e-06, + "logits/chosen": -2.65844464302063, + "logits/rejected": -2.3178467750549316, + "logps/chosen": -158.33909606933594, + "logps/rejected": -1240.73876953125, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8368174433708191, + "rewards/margins": 11.120333671569824, + "rewards/rejected": -11.95715045928955, + "step": 10500 + }, + { + "epoch": 0.63, + "learning_rate": 1.838513124752142e-06, + "logits/chosen": -2.704360246658325, + "logits/rejected": -2.271937131881714, + "logps/chosen": -121.60099029541016, + "logps/rejected": -1182.9244384765625, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6088675260543823, + "rewards/margins": 10.778694152832031, + "rewards/rejected": -11.387561798095703, + "step": 10510 + }, + { + "epoch": 0.63, + "learning_rate": 1.833496299033824e-06, + "logits/chosen": -2.6805663108825684, + "logits/rejected": -2.2875351905822754, + "logps/chosen": -154.54995727539062, + "logps/rejected": -1141.4454345703125, + "loss": 0.0209, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.8170779347419739, + "rewards/margins": 10.14470100402832, + "rewards/rejected": -10.96177864074707, + "step": 10520 + }, + { + "epoch": 0.63, + "learning_rate": 1.8284823610084375e-06, + "logits/chosen": -2.705570697784424, + "logits/rejected": -2.286400318145752, + "logps/chosen": -174.88827514648438, + "logps/rejected": -1088.65234375, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9820972681045532, + "rewards/margins": 9.472587585449219, + "rewards/rejected": -10.454684257507324, + "step": 10530 + }, + { + "epoch": 0.63, + "learning_rate": 1.8234713323993622e-06, + "logits/chosen": -2.7189624309539795, + "logits/rejected": -2.221888780593872, + "logps/chosen": -135.50201416015625, + "logps/rejected": -1256.1312255859375, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6958117485046387, + "rewards/margins": 11.418486595153809, + "rewards/rejected": -12.114297866821289, + "step": 10540 + }, + { + "epoch": 0.63, + "learning_rate": 1.8184632349173747e-06, + "logits/chosen": -2.667470693588257, + "logits/rejected": -2.3295605182647705, + "logps/chosen": -145.33932495117188, + "logps/rejected": -1128.9957275390625, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.777393102645874, + "rewards/margins": 10.06690502166748, + "rewards/rejected": -10.84429931640625, + "step": 10550 + }, + { + "epoch": 0.63, + "learning_rate": 1.8134580902605491e-06, + "logits/chosen": -2.7305068969726562, + "logits/rejected": -2.313091278076172, + "logps/chosen": -164.78121948242188, + "logps/rejected": -1104.535400390625, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9076918363571167, + "rewards/margins": 9.698684692382812, + "rewards/rejected": -10.606376647949219, + "step": 10560 + }, + { + "epoch": 0.63, + "learning_rate": 1.8084559201141677e-06, + "logits/chosen": -2.6300487518310547, + "logits/rejected": -2.2776150703430176, + "logps/chosen": -184.3844451904297, + "logps/rejected": -1363.12744140625, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1176321506500244, + "rewards/margins": 12.056962013244629, + "rewards/rejected": -13.174595832824707, + "step": 10570 + }, + { + "epoch": 0.63, + "learning_rate": 1.803456746150627e-06, + "logits/chosen": -2.6253128051757812, + "logits/rejected": -2.2257564067840576, + "logps/chosen": -136.60572814941406, + "logps/rejected": -1179.6331787109375, + "loss": 0.0545, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7256103754043579, + "rewards/margins": 10.64277172088623, + "rewards/rejected": -11.368383407592773, + "step": 10580 + }, + { + "epoch": 0.63, + "learning_rate": 1.7984605900293395e-06, + "logits/chosen": -2.6622276306152344, + "logits/rejected": -2.189941167831421, + "logps/chosen": -180.76763916015625, + "logps/rejected": -1323.7637939453125, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1611995697021484, + "rewards/margins": 11.631550788879395, + "rewards/rejected": -12.792750358581543, + "step": 10590 + }, + { + "epoch": 0.63, + "learning_rate": 1.7934674733966426e-06, + "logits/chosen": -2.670856475830078, + "logits/rejected": -2.275364637374878, + "logps/chosen": -181.9188995361328, + "logps/rejected": -1288.626708984375, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1387431621551514, + "rewards/margins": 11.296748161315918, + "rewards/rejected": -12.435491561889648, + "step": 10600 + }, + { + "epoch": 0.63, + "learning_rate": 1.7884774178857079e-06, + "logits/chosen": -2.6612496376037598, + "logits/rejected": -2.250316619873047, + "logps/chosen": -185.29873657226562, + "logps/rejected": -1283.1878662109375, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1791346073150635, + "rewards/margins": 11.206694602966309, + "rewards/rejected": -12.385828018188477, + "step": 10610 + }, + { + "epoch": 0.63, + "learning_rate": 1.7834904451164417e-06, + "logits/chosen": -2.666290044784546, + "logits/rejected": -2.2596793174743652, + "logps/chosen": -172.3092498779297, + "logps/rejected": -1252.9447021484375, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0615670680999756, + "rewards/margins": 11.021515846252441, + "rewards/rejected": -12.083083152770996, + "step": 10620 + }, + { + "epoch": 0.63, + "learning_rate": 1.7785065766953932e-06, + "logits/chosen": -2.6500802040100098, + "logits/rejected": -2.146433115005493, + "logps/chosen": -197.70945739746094, + "logps/rejected": -1286.618896484375, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3150241374969482, + "rewards/margins": 11.083675384521484, + "rewards/rejected": -12.398698806762695, + "step": 10630 + }, + { + "epoch": 0.63, + "learning_rate": 1.7735258342156653e-06, + "logits/chosen": -2.6253857612609863, + "logits/rejected": -2.1825995445251465, + "logps/chosen": -210.876708984375, + "logps/rejected": -1185.6953125, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4058815240859985, + "rewards/margins": 10.007796287536621, + "rewards/rejected": -11.413679122924805, + "step": 10640 + }, + { + "epoch": 0.64, + "learning_rate": 1.768548239256815e-06, + "logits/chosen": -2.6982548236846924, + "logits/rejected": -2.3529727458953857, + "logps/chosen": -237.3026580810547, + "logps/rejected": -1256.3804931640625, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6476304531097412, + "rewards/margins": 10.454874992370605, + "rewards/rejected": -12.102503776550293, + "step": 10650 + }, + { + "epoch": 0.64, + "learning_rate": 1.7635738133847608e-06, + "logits/chosen": -2.713792324066162, + "logits/rejected": -2.2466139793395996, + "logps/chosen": -239.65487670898438, + "logps/rejected": -1288.21142578125, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.663100242614746, + "rewards/margins": 10.771455764770508, + "rewards/rejected": -12.434554100036621, + "step": 10660 + }, + { + "epoch": 0.64, + "learning_rate": 1.7586025781516958e-06, + "logits/chosen": -2.6637954711914062, + "logits/rejected": -2.270134687423706, + "logps/chosen": -211.2230224609375, + "logps/rejected": -1232.7806396484375, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4256975650787354, + "rewards/margins": 10.447927474975586, + "rewards/rejected": -11.873624801635742, + "step": 10670 + }, + { + "epoch": 0.64, + "learning_rate": 1.7536345550959844e-06, + "logits/chosen": -2.7061476707458496, + "logits/rejected": -2.2232162952423096, + "logps/chosen": -190.14871215820312, + "logps/rejected": -1213.1466064453125, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2435824871063232, + "rewards/margins": 10.444583892822266, + "rewards/rejected": -11.688166618347168, + "step": 10680 + }, + { + "epoch": 0.64, + "learning_rate": 1.7486697657420752e-06, + "logits/chosen": -2.711820363998413, + "logits/rejected": -2.3143553733825684, + "logps/chosen": -180.61358642578125, + "logps/rejected": -1146.7406005859375, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1420700550079346, + "rewards/margins": 9.889798164367676, + "rewards/rejected": -11.031867980957031, + "step": 10690 + }, + { + "epoch": 0.64, + "learning_rate": 1.743708231600409e-06, + "logits/chosen": -2.706454038619995, + "logits/rejected": -2.3063597679138184, + "logps/chosen": -213.4799346923828, + "logps/rejected": -1232.907470703125, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4261481761932373, + "rewards/margins": 10.453648567199707, + "rewards/rejected": -11.879796028137207, + "step": 10700 + }, + { + "epoch": 0.64, + "learning_rate": 1.7387499741673197e-06, + "logits/chosen": -2.655303955078125, + "logits/rejected": -2.2451364994049072, + "logps/chosen": -204.3408660888672, + "logps/rejected": -1256.3521728515625, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3569390773773193, + "rewards/margins": 10.759969711303711, + "rewards/rejected": -12.116909980773926, + "step": 10710 + }, + { + "epoch": 0.64, + "learning_rate": 1.7337950149249466e-06, + "logits/chosen": -2.6937034130096436, + "logits/rejected": -2.2553374767303467, + "logps/chosen": -196.3228302001953, + "logps/rejected": -1234.77197265625, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2222070693969727, + "rewards/margins": 10.689011573791504, + "rewards/rejected": -11.911218643188477, + "step": 10720 + }, + { + "epoch": 0.64, + "learning_rate": 1.7288433753411383e-06, + "logits/chosen": -2.7084755897521973, + "logits/rejected": -2.300086736679077, + "logps/chosen": -176.45362854003906, + "logps/rejected": -1240.108642578125, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0833253860473633, + "rewards/margins": 10.869207382202148, + "rewards/rejected": -11.952531814575195, + "step": 10730 + }, + { + "epoch": 0.64, + "learning_rate": 1.7238950768693619e-06, + "logits/chosen": -2.6832168102264404, + "logits/rejected": -2.219399929046631, + "logps/chosen": -190.40277099609375, + "logps/rejected": -1201.741943359375, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1526172161102295, + "rewards/margins": 10.423155784606934, + "rewards/rejected": -11.575773239135742, + "step": 10740 + }, + { + "epoch": 0.64, + "learning_rate": 1.7189501409486061e-06, + "logits/chosen": -2.6671907901763916, + "logits/rejected": -2.2211320400238037, + "logps/chosen": -166.03326416015625, + "logps/rejected": -1250.656005859375, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9305809736251831, + "rewards/margins": 11.132793426513672, + "rewards/rejected": -12.063374519348145, + "step": 10750 + }, + { + "epoch": 0.64, + "learning_rate": 1.7140085890032951e-06, + "logits/chosen": -2.6868786811828613, + "logits/rejected": -2.216801643371582, + "logps/chosen": -183.50216674804688, + "logps/rejected": -1362.376953125, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0438324213027954, + "rewards/margins": 12.142550468444824, + "rewards/rejected": -13.186384201049805, + "step": 10760 + }, + { + "epoch": 0.64, + "learning_rate": 1.7090704424431882e-06, + "logits/chosen": -2.6949551105499268, + "logits/rejected": -2.3042445182800293, + "logps/chosen": -188.57101440429688, + "logps/rejected": -1268.557861328125, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1118698120117188, + "rewards/margins": 11.11991024017334, + "rewards/rejected": -12.231779098510742, + "step": 10770 + }, + { + "epoch": 0.64, + "learning_rate": 1.704135722663291e-06, + "logits/chosen": -2.6836369037628174, + "logits/rejected": -2.2245659828186035, + "logps/chosen": -187.42575073242188, + "logps/rejected": -1304.1470947265625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1433792114257812, + "rewards/margins": 11.452189445495605, + "rewards/rejected": -12.59556770324707, + "step": 10780 + }, + { + "epoch": 0.64, + "learning_rate": 1.6992044510437644e-06, + "logits/chosen": -2.696124315261841, + "logits/rejected": -2.2626724243164062, + "logps/chosen": -205.2533721923828, + "logps/rejected": -1335.229248046875, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3333510160446167, + "rewards/margins": 11.585688591003418, + "rewards/rejected": -12.919039726257324, + "step": 10790 + }, + { + "epoch": 0.64, + "learning_rate": 1.6942766489498278e-06, + "logits/chosen": -2.7053720951080322, + "logits/rejected": -2.2856245040893555, + "logps/chosen": -164.22140502929688, + "logps/rejected": -1135.5733642578125, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9044491052627563, + "rewards/margins": 10.012067794799805, + "rewards/rejected": -10.916516304016113, + "step": 10800 + }, + { + "epoch": 0.64, + "learning_rate": 1.689352337731669e-06, + "logits/chosen": -2.74284029006958, + "logits/rejected": -2.3319687843322754, + "logps/chosen": -194.62136840820312, + "logps/rejected": -1335.9298095703125, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.22065007686615, + "rewards/margins": 11.702608108520508, + "rewards/rejected": -12.923257827758789, + "step": 10810 + }, + { + "epoch": 0.65, + "learning_rate": 1.6844315387243514e-06, + "logits/chosen": -2.722900629043579, + "logits/rejected": -2.204911708831787, + "logps/chosen": -174.6101531982422, + "logps/rejected": -1229.6229248046875, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.063889741897583, + "rewards/margins": 10.79820442199707, + "rewards/rejected": -11.862093925476074, + "step": 10820 + }, + { + "epoch": 0.65, + "learning_rate": 1.6795142732477222e-06, + "logits/chosen": -2.628021478652954, + "logits/rejected": -2.2600715160369873, + "logps/chosen": -178.87042236328125, + "logps/rejected": -1090.664306640625, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0961132049560547, + "rewards/margins": 9.353715896606445, + "rewards/rejected": -10.4498291015625, + "step": 10830 + }, + { + "epoch": 0.65, + "learning_rate": 1.6746005626063163e-06, + "logits/chosen": -2.688551425933838, + "logits/rejected": -2.259103775024414, + "logps/chosen": -154.6709747314453, + "logps/rejected": -1074.783447265625, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8224309086799622, + "rewards/margins": 9.493600845336914, + "rewards/rejected": -10.316032409667969, + "step": 10840 + }, + { + "epoch": 0.65, + "learning_rate": 1.6696904280892716e-06, + "logits/chosen": -2.656571626663208, + "logits/rejected": -2.209709644317627, + "logps/chosen": -168.6212615966797, + "logps/rejected": -1336.944091796875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9901341199874878, + "rewards/margins": 11.925986289978027, + "rewards/rejected": -12.916119575500488, + "step": 10850 + }, + { + "epoch": 0.65, + "learning_rate": 1.6647838909702287e-06, + "logits/chosen": -2.6694931983947754, + "logits/rejected": -2.2432913780212402, + "logps/chosen": -177.16159057617188, + "logps/rejected": -1245.0076904296875, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1048028469085693, + "rewards/margins": 10.907476425170898, + "rewards/rejected": -12.012280464172363, + "step": 10860 + }, + { + "epoch": 0.65, + "learning_rate": 1.6598809725072412e-06, + "logits/chosen": -2.7171568870544434, + "logits/rejected": -2.241743326187134, + "logps/chosen": -158.462158203125, + "logps/rejected": -1329.7509765625, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9434038400650024, + "rewards/margins": 11.908381462097168, + "rewards/rejected": -12.851785659790039, + "step": 10870 + }, + { + "epoch": 0.65, + "learning_rate": 1.6549816939426888e-06, + "logits/chosen": -2.672494411468506, + "logits/rejected": -2.3112988471984863, + "logps/chosen": -157.9896697998047, + "logps/rejected": -1208.8345947265625, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.93427574634552, + "rewards/margins": 10.699585914611816, + "rewards/rejected": -11.633859634399414, + "step": 10880 + }, + { + "epoch": 0.65, + "learning_rate": 1.6500860765031767e-06, + "logits/chosen": -2.662343740463257, + "logits/rejected": -2.240168809890747, + "logps/chosen": -181.88897705078125, + "logps/rejected": -1245.9794921875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0778279304504395, + "rewards/margins": 10.941686630249023, + "rewards/rejected": -12.019515037536621, + "step": 10890 + }, + { + "epoch": 0.65, + "learning_rate": 1.64519414139945e-06, + "logits/chosen": -2.6769790649414062, + "logits/rejected": -2.2713465690612793, + "logps/chosen": -161.05191040039062, + "logps/rejected": -1284.2474365234375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9550189971923828, + "rewards/margins": 11.451133728027344, + "rewards/rejected": -12.406153678894043, + "step": 10900 + }, + { + "epoch": 0.65, + "learning_rate": 1.6403059098263003e-06, + "logits/chosen": -2.7319157123565674, + "logits/rejected": -2.285252094268799, + "logps/chosen": -167.67991638183594, + "logps/rejected": -1321.8765869140625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9087414741516113, + "rewards/margins": 11.856451034545898, + "rewards/rejected": -12.765192031860352, + "step": 10910 + }, + { + "epoch": 0.65, + "learning_rate": 1.6354214029624719e-06, + "logits/chosen": -2.6906540393829346, + "logits/rejected": -2.2802863121032715, + "logps/chosen": -169.17660522460938, + "logps/rejected": -1169.43896484375, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9710705876350403, + "rewards/margins": 10.287484169006348, + "rewards/rejected": -11.25855541229248, + "step": 10920 + }, + { + "epoch": 0.65, + "learning_rate": 1.6305406419705704e-06, + "logits/chosen": -2.670264482498169, + "logits/rejected": -2.274122714996338, + "logps/chosen": -193.5460205078125, + "logps/rejected": -1333.782958984375, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.228770136833191, + "rewards/margins": 11.671228408813477, + "rewards/rejected": -12.899998664855957, + "step": 10930 + }, + { + "epoch": 0.65, + "learning_rate": 1.6256636479969757e-06, + "logits/chosen": -2.7037606239318848, + "logits/rejected": -2.2911012172698975, + "logps/chosen": -172.2279510498047, + "logps/rejected": -1270.383056640625, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0514845848083496, + "rewards/margins": 11.207279205322266, + "rewards/rejected": -12.258763313293457, + "step": 10940 + }, + { + "epoch": 0.65, + "learning_rate": 1.6207904421717438e-06, + "logits/chosen": -2.677539587020874, + "logits/rejected": -2.2273712158203125, + "logps/chosen": -157.8165740966797, + "logps/rejected": -1257.1446533203125, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8738549947738647, + "rewards/margins": 11.240236282348633, + "rewards/rejected": -12.114090919494629, + "step": 10950 + }, + { + "epoch": 0.65, + "learning_rate": 1.6159210456085179e-06, + "logits/chosen": -2.677459239959717, + "logits/rejected": -2.212089776992798, + "logps/chosen": -160.33921813964844, + "logps/rejected": -1302.200439453125, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.864704966545105, + "rewards/margins": 11.718900680541992, + "rewards/rejected": -12.583606719970703, + "step": 10960 + }, + { + "epoch": 0.65, + "learning_rate": 1.6110554794044397e-06, + "logits/chosen": -2.66448974609375, + "logits/rejected": -2.289799928665161, + "logps/chosen": -169.28921508789062, + "logps/rejected": -1146.988525390625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9300249218940735, + "rewards/margins": 10.09156608581543, + "rewards/rejected": -11.021590232849121, + "step": 10970 + }, + { + "epoch": 0.65, + "learning_rate": 1.6061937646400526e-06, + "logits/chosen": -2.707273006439209, + "logits/rejected": -2.308811902999878, + "logps/chosen": -152.71749877929688, + "logps/rejected": -1228.77587890625, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7697857618331909, + "rewards/margins": 11.076682090759277, + "rewards/rejected": -11.846467971801758, + "step": 10980 + }, + { + "epoch": 0.66, + "learning_rate": 1.6013359223792155e-06, + "logits/chosen": -2.7099242210388184, + "logits/rejected": -2.2905263900756836, + "logps/chosen": -173.2109832763672, + "logps/rejected": -1207.319580078125, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0260493755340576, + "rewards/margins": 10.619953155517578, + "rewards/rejected": -11.646002769470215, + "step": 10990 + }, + { + "epoch": 0.66, + "learning_rate": 1.596481973669009e-06, + "logits/chosen": -2.6645987033843994, + "logits/rejected": -2.262202501296997, + "logps/chosen": -164.31773376464844, + "logps/rejected": -1315.1573486328125, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9622052311897278, + "rewards/margins": 11.734614372253418, + "rewards/rejected": -12.696820259094238, + "step": 11000 + }, + { + "epoch": 0.66, + "learning_rate": 1.591631939539644e-06, + "logits/chosen": -2.686890125274658, + "logits/rejected": -2.299267292022705, + "logps/chosen": -164.4632568359375, + "logps/rejected": -1337.7044677734375, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9094980359077454, + "rewards/margins": 12.016288757324219, + "rewards/rejected": -12.925786972045898, + "step": 11010 + }, + { + "epoch": 0.66, + "learning_rate": 1.5867858410043688e-06, + "logits/chosen": -2.666682481765747, + "logits/rejected": -2.290273666381836, + "logps/chosen": -187.35986328125, + "logps/rejected": -1242.795654296875, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1542308330535889, + "rewards/margins": 10.819709777832031, + "rewards/rejected": -11.9739408493042, + "step": 11020 + }, + { + "epoch": 0.66, + "learning_rate": 1.5819436990593855e-06, + "logits/chosen": -2.6902308464050293, + "logits/rejected": -2.2475180625915527, + "logps/chosen": -210.10440063476562, + "logps/rejected": -1348.645263671875, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3616456985473633, + "rewards/margins": 11.679061889648438, + "rewards/rejected": -13.0407075881958, + "step": 11030 + }, + { + "epoch": 0.66, + "learning_rate": 1.5771055346837498e-06, + "logits/chosen": -2.650068521499634, + "logits/rejected": -2.249622106552124, + "logps/chosen": -209.57089233398438, + "logps/rejected": -1287.388671875, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3396633863449097, + "rewards/margins": 11.081099510192871, + "rewards/rejected": -12.42076301574707, + "step": 11040 + }, + { + "epoch": 0.66, + "learning_rate": 1.5722713688392844e-06, + "logits/chosen": -2.6901421546936035, + "logits/rejected": -2.28308367729187, + "logps/chosen": -155.38592529296875, + "logps/rejected": -1393.3990478515625, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8177770376205444, + "rewards/margins": 12.665241241455078, + "rewards/rejected": -13.48301887512207, + "step": 11050 + }, + { + "epoch": 0.66, + "learning_rate": 1.5674412224704902e-06, + "logits/chosen": -2.6922764778137207, + "logits/rejected": -2.2809135913848877, + "logps/chosen": -173.3521270751953, + "logps/rejected": -1145.6488037109375, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.020254373550415, + "rewards/margins": 10.00926399230957, + "rewards/rejected": -11.029520034790039, + "step": 11060 + }, + { + "epoch": 0.66, + "learning_rate": 1.5626151165044522e-06, + "logits/chosen": -2.7068779468536377, + "logits/rejected": -2.1955337524414062, + "logps/chosen": -161.06285095214844, + "logps/rejected": -1270.3131103515625, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8800745010375977, + "rewards/margins": 11.366345405578613, + "rewards/rejected": -12.246420860290527, + "step": 11070 + }, + { + "epoch": 0.66, + "learning_rate": 1.557793071850749e-06, + "logits/chosen": -2.6897425651550293, + "logits/rejected": -2.2221031188964844, + "logps/chosen": -157.92539978027344, + "logps/rejected": -1311.1961669921875, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8567537069320679, + "rewards/margins": 11.795483589172363, + "rewards/rejected": -12.652236938476562, + "step": 11080 + }, + { + "epoch": 0.66, + "learning_rate": 1.552975109401365e-06, + "logits/chosen": -2.7281808853149414, + "logits/rejected": -2.3182520866394043, + "logps/chosen": -187.7662353515625, + "logps/rejected": -1216.343017578125, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1299870014190674, + "rewards/margins": 10.582721710205078, + "rewards/rejected": -11.712708473205566, + "step": 11090 + }, + { + "epoch": 0.66, + "learning_rate": 1.5481612500305964e-06, + "logits/chosen": -2.686325788497925, + "logits/rejected": -2.291994094848633, + "logps/chosen": -174.48562622070312, + "logps/rejected": -1247.1187744140625, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0441081523895264, + "rewards/margins": 10.992681503295898, + "rewards/rejected": -12.03679084777832, + "step": 11100 + }, + { + "epoch": 0.66, + "learning_rate": 1.5433515145949636e-06, + "logits/chosen": -2.6949706077575684, + "logits/rejected": -2.3253660202026367, + "logps/chosen": -173.86875915527344, + "logps/rejected": -1259.810546875, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9885636568069458, + "rewards/margins": 11.173116683959961, + "rewards/rejected": -12.161680221557617, + "step": 11110 + }, + { + "epoch": 0.66, + "learning_rate": 1.5385459239331173e-06, + "logits/chosen": -2.6652445793151855, + "logits/rejected": -2.2798759937286377, + "logps/chosen": -177.71279907226562, + "logps/rejected": -1245.261474609375, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0771000385284424, + "rewards/margins": 10.931848526000977, + "rewards/rejected": -12.008947372436523, + "step": 11120 + }, + { + "epoch": 0.66, + "learning_rate": 1.5337444988657546e-06, + "logits/chosen": -2.7376301288604736, + "logits/rejected": -2.2805728912353516, + "logps/chosen": -182.17759704589844, + "logps/rejected": -1372.2288818359375, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0855470895767212, + "rewards/margins": 12.18674373626709, + "rewards/rejected": -13.27229118347168, + "step": 11130 + }, + { + "epoch": 0.66, + "learning_rate": 1.5289472601955219e-06, + "logits/chosen": -2.6978373527526855, + "logits/rejected": -2.275664806365967, + "logps/chosen": -192.24615478515625, + "logps/rejected": -1245.9857177734375, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2008732557296753, + "rewards/margins": 10.808465957641602, + "rewards/rejected": -12.009337425231934, + "step": 11140 + }, + { + "epoch": 0.66, + "learning_rate": 1.5241542287069273e-06, + "logits/chosen": -2.6892693042755127, + "logits/rejected": -2.257542848587036, + "logps/chosen": -171.97042846679688, + "logps/rejected": -1199.820556640625, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0330320596694946, + "rewards/margins": 10.519083023071289, + "rewards/rejected": -11.552114486694336, + "step": 11150 + }, + { + "epoch": 0.67, + "learning_rate": 1.5193654251662531e-06, + "logits/chosen": -2.651379346847534, + "logits/rejected": -2.211075782775879, + "logps/chosen": -202.85946655273438, + "logps/rejected": -1235.321533203125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2954750061035156, + "rewards/margins": 10.60512924194336, + "rewards/rejected": -11.900605201721191, + "step": 11160 + }, + { + "epoch": 0.67, + "learning_rate": 1.514580870321462e-06, + "logits/chosen": -2.700159788131714, + "logits/rejected": -2.2890915870666504, + "logps/chosen": -172.19615173339844, + "logps/rejected": -1392.2249755859375, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9928817749023438, + "rewards/margins": 12.490656852722168, + "rewards/rejected": -13.483538627624512, + "step": 11170 + }, + { + "epoch": 0.67, + "learning_rate": 1.509800584902108e-06, + "logits/chosen": -2.695985794067383, + "logits/rejected": -2.2855312824249268, + "logps/chosen": -181.10806274414062, + "logps/rejected": -1326.548095703125, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0607606172561646, + "rewards/margins": 11.766735076904297, + "rewards/rejected": -12.827496528625488, + "step": 11180 + }, + { + "epoch": 0.67, + "learning_rate": 1.5050245896192503e-06, + "logits/chosen": -2.6450653076171875, + "logits/rejected": -2.217040538787842, + "logps/chosen": -215.6058349609375, + "logps/rejected": -1201.48681640625, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4160423278808594, + "rewards/margins": 10.149759292602539, + "rewards/rejected": -11.565801620483398, + "step": 11190 + }, + { + "epoch": 0.67, + "learning_rate": 1.5002529051653576e-06, + "logits/chosen": -2.626765727996826, + "logits/rejected": -2.264268398284912, + "logps/chosen": -171.37767028808594, + "logps/rejected": -1319.353271484375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9757068753242493, + "rewards/margins": 11.780801773071289, + "rewards/rejected": -12.756509780883789, + "step": 11200 + }, + { + "epoch": 0.67, + "learning_rate": 1.4954855522142225e-06, + "logits/chosen": -2.6638448238372803, + "logits/rejected": -2.219672918319702, + "logps/chosen": -179.73074340820312, + "logps/rejected": -1286.843505859375, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1611230373382568, + "rewards/margins": 11.27320671081543, + "rewards/rejected": -12.434330940246582, + "step": 11210 + }, + { + "epoch": 0.67, + "learning_rate": 1.4907225514208724e-06, + "logits/chosen": -2.7109882831573486, + "logits/rejected": -2.2481467723846436, + "logps/chosen": -188.31228637695312, + "logps/rejected": -1226.2203369140625, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1562902927398682, + "rewards/margins": 10.656560897827148, + "rewards/rejected": -11.812849044799805, + "step": 11220 + }, + { + "epoch": 0.67, + "learning_rate": 1.4859639234214774e-06, + "logits/chosen": -2.730440855026245, + "logits/rejected": -2.3092398643493652, + "logps/chosen": -197.57882690429688, + "logps/rejected": -1378.6351318359375, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.256392478942871, + "rewards/margins": 12.091400146484375, + "rewards/rejected": -13.347793579101562, + "step": 11230 + }, + { + "epoch": 0.67, + "learning_rate": 1.48120968883326e-06, + "logits/chosen": -2.6735386848449707, + "logits/rejected": -2.206005334854126, + "logps/chosen": -208.0272216796875, + "logps/rejected": -1233.26025390625, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3230682611465454, + "rewards/margins": 10.568212509155273, + "rewards/rejected": -11.891278266906738, + "step": 11240 + }, + { + "epoch": 0.67, + "learning_rate": 1.4764598682544124e-06, + "logits/chosen": -2.659745216369629, + "logits/rejected": -2.2526659965515137, + "logps/chosen": -180.5372314453125, + "logps/rejected": -1184.196044921875, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1693991422653198, + "rewards/margins": 10.236166000366211, + "rewards/rejected": -11.405566215515137, + "step": 11250 + }, + { + "epoch": 0.67, + "learning_rate": 1.4717144822639988e-06, + "logits/chosen": -2.6884684562683105, + "logits/rejected": -2.194739580154419, + "logps/chosen": -201.0059814453125, + "logps/rejected": -1283.117431640625, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3106249570846558, + "rewards/margins": 11.076963424682617, + "rewards/rejected": -12.38758659362793, + "step": 11260 + }, + { + "epoch": 0.67, + "learning_rate": 1.4669735514218709e-06, + "logits/chosen": -2.6820919513702393, + "logits/rejected": -2.2434215545654297, + "logps/chosen": -207.5338134765625, + "logps/rejected": -1305.942138671875, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2980427742004395, + "rewards/margins": 11.307580947875977, + "rewards/rejected": -12.605623245239258, + "step": 11270 + }, + { + "epoch": 0.67, + "learning_rate": 1.46223709626858e-06, + "logits/chosen": -2.6448042392730713, + "logits/rejected": -2.1442885398864746, + "logps/chosen": -178.77737426757812, + "logps/rejected": -1428.279541015625, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.014285922050476, + "rewards/margins": 12.814753532409668, + "rewards/rejected": -13.829038619995117, + "step": 11280 + }, + { + "epoch": 0.67, + "learning_rate": 1.457505137325283e-06, + "logits/chosen": -2.68040132522583, + "logits/rejected": -2.30275297164917, + "logps/chosen": -198.19296264648438, + "logps/rejected": -1273.473876953125, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2826688289642334, + "rewards/margins": 11.02665901184082, + "rewards/rejected": -12.309328079223633, + "step": 11290 + }, + { + "epoch": 0.67, + "learning_rate": 1.452777695093659e-06, + "logits/chosen": -2.675504207611084, + "logits/rejected": -2.229268789291382, + "logps/chosen": -198.2040252685547, + "logps/rejected": -1304.1357421875, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2921191453933716, + "rewards/margins": 11.297712326049805, + "rewards/rejected": -12.589831352233887, + "step": 11300 + }, + { + "epoch": 0.67, + "learning_rate": 1.448054790055817e-06, + "logits/chosen": -2.6589934825897217, + "logits/rejected": -2.1987435817718506, + "logps/chosen": -233.8262481689453, + "logps/rejected": -1269.7509765625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.645715355873108, + "rewards/margins": 10.614938735961914, + "rewards/rejected": -12.260655403137207, + "step": 11310 + }, + { + "epoch": 0.68, + "learning_rate": 1.443336442674208e-06, + "logits/chosen": -2.6648573875427246, + "logits/rejected": -2.232083797454834, + "logps/chosen": -190.82818603515625, + "logps/rejected": -1339.959716796875, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1801660060882568, + "rewards/margins": 11.772756576538086, + "rewards/rejected": -12.952923774719238, + "step": 11320 + }, + { + "epoch": 0.68, + "learning_rate": 1.438622673391537e-06, + "logits/chosen": -2.639448881149292, + "logits/rejected": -2.1928868293762207, + "logps/chosen": -209.60018920898438, + "logps/rejected": -1266.0494384765625, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4015882015228271, + "rewards/margins": 10.827807426452637, + "rewards/rejected": -12.229395866394043, + "step": 11330 + }, + { + "epoch": 0.68, + "learning_rate": 1.4339135026306738e-06, + "logits/chosen": -2.656280994415283, + "logits/rejected": -2.2428205013275146, + "logps/chosen": -206.0610809326172, + "logps/rejected": -1218.919189453125, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3237788677215576, + "rewards/margins": 10.418710708618164, + "rewards/rejected": -11.7424898147583, + "step": 11340 + }, + { + "epoch": 0.68, + "learning_rate": 1.4292089507945655e-06, + "logits/chosen": -2.6868033409118652, + "logits/rejected": -2.2751266956329346, + "logps/chosen": -196.91854858398438, + "logps/rejected": -1246.6492919921875, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2253191471099854, + "rewards/margins": 10.809341430664062, + "rewards/rejected": -12.034660339355469, + "step": 11350 + }, + { + "epoch": 0.68, + "learning_rate": 1.424509038266143e-06, + "logits/chosen": -2.6107964515686035, + "logits/rejected": -2.265533685684204, + "logps/chosen": -184.57809448242188, + "logps/rejected": -1329.9722900390625, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.164617657661438, + "rewards/margins": 11.694311141967773, + "rewards/rejected": -12.858929634094238, + "step": 11360 + }, + { + "epoch": 0.68, + "learning_rate": 1.4198137854082443e-06, + "logits/chosen": -2.7021453380584717, + "logits/rejected": -2.2715065479278564, + "logps/chosen": -210.5146026611328, + "logps/rejected": -1284.7017822265625, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3556597232818604, + "rewards/margins": 11.051874160766602, + "rewards/rejected": -12.407533645629883, + "step": 11370 + }, + { + "epoch": 0.68, + "learning_rate": 1.4151232125635123e-06, + "logits/chosen": -2.7190427780151367, + "logits/rejected": -2.270233154296875, + "logps/chosen": -188.6830596923828, + "logps/rejected": -1210.3553466796875, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1461411714553833, + "rewards/margins": 10.506145477294922, + "rewards/rejected": -11.652286529541016, + "step": 11380 + }, + { + "epoch": 0.68, + "learning_rate": 1.4104373400543162e-06, + "logits/chosen": -2.725419044494629, + "logits/rejected": -2.337550163269043, + "logps/chosen": -198.15994262695312, + "logps/rejected": -1274.6337890625, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.297153115272522, + "rewards/margins": 10.995423316955566, + "rewards/rejected": -12.29257583618164, + "step": 11390 + }, + { + "epoch": 0.68, + "learning_rate": 1.405756188182661e-06, + "logits/chosen": -2.7237181663513184, + "logits/rejected": -2.2582039833068848, + "logps/chosen": -203.4135284423828, + "logps/rejected": -1403.0826416015625, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3156065940856934, + "rewards/margins": 12.275812149047852, + "rewards/rejected": -13.59142017364502, + "step": 11400 + }, + { + "epoch": 0.68, + "learning_rate": 1.4010797772300972e-06, + "logits/chosen": -2.6992669105529785, + "logits/rejected": -2.2757856845855713, + "logps/chosen": -180.81820678710938, + "logps/rejected": -1240.244384765625, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1167184114456177, + "rewards/margins": 10.83329963684082, + "rewards/rejected": -11.950017929077148, + "step": 11410 + }, + { + "epoch": 0.68, + "learning_rate": 1.396408127457637e-06, + "logits/chosen": -2.696956157684326, + "logits/rejected": -2.2336435317993164, + "logps/chosen": -185.38070678710938, + "logps/rejected": -1322.981201171875, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.144913673400879, + "rewards/margins": 11.636453628540039, + "rewards/rejected": -12.781367301940918, + "step": 11420 + }, + { + "epoch": 0.68, + "learning_rate": 1.3917412591056623e-06, + "logits/chosen": -2.71138596534729, + "logits/rejected": -2.244676113128662, + "logps/chosen": -190.50401306152344, + "logps/rejected": -1354.371826171875, + "loss": 0.0492, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1792914867401123, + "rewards/margins": 11.913422584533691, + "rewards/rejected": -13.0927152633667, + "step": 11430 + }, + { + "epoch": 0.68, + "learning_rate": 1.3870791923938408e-06, + "logits/chosen": -2.699059009552002, + "logits/rejected": -2.295976161956787, + "logps/chosen": -206.33114624023438, + "logps/rejected": -1343.3280029296875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3789557218551636, + "rewards/margins": 11.613662719726562, + "rewards/rejected": -12.992619514465332, + "step": 11440 + }, + { + "epoch": 0.68, + "learning_rate": 1.3824219475210337e-06, + "logits/chosen": -2.6734960079193115, + "logits/rejected": -2.2320971488952637, + "logps/chosen": -213.85781860351562, + "logps/rejected": -1303.5487060546875, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4375171661376953, + "rewards/margins": 11.161863327026367, + "rewards/rejected": -12.599379539489746, + "step": 11450 + }, + { + "epoch": 0.68, + "learning_rate": 1.3777695446652167e-06, + "logits/chosen": -2.717869758605957, + "logits/rejected": -2.301353931427002, + "logps/chosen": -196.81044006347656, + "logps/rejected": -1369.522216796875, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2375197410583496, + "rewards/margins": 12.009973526000977, + "rewards/rejected": -13.2474946975708, + "step": 11460 + }, + { + "epoch": 0.68, + "learning_rate": 1.3731220039833798e-06, + "logits/chosen": -2.6929969787597656, + "logits/rejected": -2.3108010292053223, + "logps/chosen": -204.46914672851562, + "logps/rejected": -1271.9449462890625, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.300736665725708, + "rewards/margins": 10.98784065246582, + "rewards/rejected": -12.288576126098633, + "step": 11470 + }, + { + "epoch": 0.68, + "learning_rate": 1.3684793456114526e-06, + "logits/chosen": -2.7030506134033203, + "logits/rejected": -2.258239269256592, + "logps/chosen": -202.55868530273438, + "logps/rejected": -1337.636962890625, + "loss": 0.0226, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.3218635320663452, + "rewards/margins": 11.600619316101074, + "rewards/rejected": -12.922483444213867, + "step": 11480 + }, + { + "epoch": 0.69, + "learning_rate": 1.3638415896642093e-06, + "logits/chosen": -2.6755852699279785, + "logits/rejected": -2.2529449462890625, + "logps/chosen": -218.40133666992188, + "logps/rejected": -1291.953369140625, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.488645315170288, + "rewards/margins": 10.97671127319336, + "rewards/rejected": -12.465356826782227, + "step": 11490 + }, + { + "epoch": 0.69, + "learning_rate": 1.359208756235184e-06, + "logits/chosen": -2.717275619506836, + "logits/rejected": -2.291421413421631, + "logps/chosen": -183.0303497314453, + "logps/rejected": -1367.19482421875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.109860897064209, + "rewards/margins": 12.122903823852539, + "rewards/rejected": -13.232765197753906, + "step": 11500 + }, + { + "epoch": 0.69, + "learning_rate": 1.3545808653965847e-06, + "logits/chosen": -2.671726703643799, + "logits/rejected": -2.3693349361419678, + "logps/chosen": -207.83706665039062, + "logps/rejected": -1266.651123046875, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3972653150558472, + "rewards/margins": 10.807889938354492, + "rewards/rejected": -12.205156326293945, + "step": 11510 + }, + { + "epoch": 0.69, + "learning_rate": 1.349957937199204e-06, + "logits/chosen": -2.699586868286133, + "logits/rejected": -2.343557357788086, + "logps/chosen": -197.34967041015625, + "logps/rejected": -1186.7872314453125, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2625820636749268, + "rewards/margins": 10.151843070983887, + "rewards/rejected": -11.414424896240234, + "step": 11520 + }, + { + "epoch": 0.69, + "learning_rate": 1.3453399916723343e-06, + "logits/chosen": -2.6766464710235596, + "logits/rejected": -2.144011974334717, + "logps/chosen": -180.8981475830078, + "logps/rejected": -1183.130126953125, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1016746759414673, + "rewards/margins": 10.29432201385498, + "rewards/rejected": -11.395998001098633, + "step": 11530 + }, + { + "epoch": 0.69, + "learning_rate": 1.3407270488236769e-06, + "logits/chosen": -2.724325180053711, + "logits/rejected": -2.2858035564422607, + "logps/chosen": -197.6632843017578, + "logps/rejected": -1284.21435546875, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2183350324630737, + "rewards/margins": 11.170289993286133, + "rewards/rejected": -12.388626098632812, + "step": 11540 + }, + { + "epoch": 0.69, + "learning_rate": 1.3361191286392644e-06, + "logits/chosen": -2.7123208045959473, + "logits/rejected": -2.321998119354248, + "logps/chosen": -211.4967498779297, + "logps/rejected": -1219.271728515625, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4731355905532837, + "rewards/margins": 10.281391143798828, + "rewards/rejected": -11.754526138305664, + "step": 11550 + }, + { + "epoch": 0.69, + "learning_rate": 1.3315162510833623e-06, + "logits/chosen": -2.707472324371338, + "logits/rejected": -2.3292603492736816, + "logps/chosen": -189.14869689941406, + "logps/rejected": -1366.7987060546875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2080543041229248, + "rewards/margins": 12.017338752746582, + "rewards/rejected": -13.225393295288086, + "step": 11560 + }, + { + "epoch": 0.69, + "learning_rate": 1.3269184360983919e-06, + "logits/chosen": -2.6877353191375732, + "logits/rejected": -2.286729574203491, + "logps/chosen": -219.62985229492188, + "logps/rejected": -1284.7747802734375, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.499387502670288, + "rewards/margins": 10.899397850036621, + "rewards/rejected": -12.398786544799805, + "step": 11570 + }, + { + "epoch": 0.69, + "learning_rate": 1.3223257036048395e-06, + "logits/chosen": -2.7114851474761963, + "logits/rejected": -2.2053208351135254, + "logps/chosen": -176.20571899414062, + "logps/rejected": -1250.053955078125, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9813610911369324, + "rewards/margins": 11.068108558654785, + "rewards/rejected": -12.049469947814941, + "step": 11580 + }, + { + "epoch": 0.69, + "learning_rate": 1.3177380735011714e-06, + "logits/chosen": -2.6327686309814453, + "logits/rejected": -2.1458797454833984, + "logps/chosen": -182.25143432617188, + "logps/rejected": -1309.0238037109375, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0889462232589722, + "rewards/margins": 11.559961318969727, + "rewards/rejected": -12.648905754089355, + "step": 11590 + }, + { + "epoch": 0.69, + "learning_rate": 1.3131555656637459e-06, + "logits/chosen": -2.6530749797821045, + "logits/rejected": -2.1939406394958496, + "logps/chosen": -182.24398803710938, + "logps/rejected": -1201.700927734375, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.08091139793396, + "rewards/margins": 10.48482894897461, + "rewards/rejected": -11.565740585327148, + "step": 11600 + }, + { + "epoch": 0.69, + "learning_rate": 1.3085781999467303e-06, + "logits/chosen": -2.6596662998199463, + "logits/rejected": -2.253260374069214, + "logps/chosen": -164.27633666992188, + "logps/rejected": -1256.12744140625, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8979629278182983, + "rewards/margins": 11.229085922241211, + "rewards/rejected": -12.127047538757324, + "step": 11610 + }, + { + "epoch": 0.69, + "learning_rate": 1.3040059961820135e-06, + "logits/chosen": -2.669900417327881, + "logits/rejected": -2.29205584526062, + "logps/chosen": -144.73570251464844, + "logps/rejected": -1288.7686767578125, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7337912321090698, + "rewards/margins": 11.727774620056152, + "rewards/rejected": -12.461565971374512, + "step": 11620 + }, + { + "epoch": 0.69, + "learning_rate": 1.2994389741791152e-06, + "logits/chosen": -2.6438167095184326, + "logits/rejected": -2.2469024658203125, + "logps/chosen": -177.0650634765625, + "logps/rejected": -1190.037353515625, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0854206085205078, + "rewards/margins": 10.371801376342773, + "rewards/rejected": -11.457222938537598, + "step": 11630 + }, + { + "epoch": 0.69, + "learning_rate": 1.294877153725112e-06, + "logits/chosen": -2.6704092025756836, + "logits/rejected": -2.285759210586548, + "logps/chosen": -194.0321807861328, + "logps/rejected": -1307.4847412109375, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2456846237182617, + "rewards/margins": 11.391227722167969, + "rewards/rejected": -12.636914253234863, + "step": 11640 + }, + { + "epoch": 0.69, + "learning_rate": 1.2903205545845378e-06, + "logits/chosen": -2.669640064239502, + "logits/rejected": -2.2860941886901855, + "logps/chosen": -172.52822875976562, + "logps/rejected": -1235.365966796875, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0323947668075562, + "rewards/margins": 10.878218650817871, + "rewards/rejected": -11.910614013671875, + "step": 11650 + }, + { + "epoch": 0.7, + "learning_rate": 1.285769196499308e-06, + "logits/chosen": -2.7038121223449707, + "logits/rejected": -2.2927186489105225, + "logps/chosen": -184.20726013183594, + "logps/rejected": -1362.823974609375, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0956871509552002, + "rewards/margins": 12.08732795715332, + "rewards/rejected": -13.183016777038574, + "step": 11660 + }, + { + "epoch": 0.7, + "learning_rate": 1.28122309918863e-06, + "logits/chosen": -2.6596062183380127, + "logits/rejected": -2.2318673133850098, + "logps/chosen": -193.51368713378906, + "logps/rejected": -1250.417236328125, + "loss": 0.0161, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.188525915145874, + "rewards/margins": 10.858878135681152, + "rewards/rejected": -12.047405242919922, + "step": 11670 + }, + { + "epoch": 0.7, + "learning_rate": 1.2766822823489175e-06, + "logits/chosen": -2.667905330657959, + "logits/rejected": -2.2183101177215576, + "logps/chosen": -176.84426879882812, + "logps/rejected": -1153.181884765625, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0317151546478271, + "rewards/margins": 10.05436897277832, + "rewards/rejected": -11.086084365844727, + "step": 11680 + }, + { + "epoch": 0.7, + "learning_rate": 1.2721467656537074e-06, + "logits/chosen": -2.7191903591156006, + "logits/rejected": -2.2563586235046387, + "logps/chosen": -167.41163635253906, + "logps/rejected": -1297.9930419921875, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9730628132820129, + "rewards/margins": 11.55770492553711, + "rewards/rejected": -12.530767440795898, + "step": 11690 + }, + { + "epoch": 0.7, + "learning_rate": 1.2676165687535719e-06, + "logits/chosen": -2.689013957977295, + "logits/rejected": -2.3450937271118164, + "logps/chosen": -152.34751892089844, + "logps/rejected": -1224.363525390625, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.807723343372345, + "rewards/margins": 10.987044334411621, + "rewards/rejected": -11.794767379760742, + "step": 11700 + }, + { + "epoch": 0.7, + "learning_rate": 1.2630917112760365e-06, + "logits/chosen": -2.687070369720459, + "logits/rejected": -2.2182226181030273, + "logps/chosen": -178.33847045898438, + "logps/rejected": -1239.14111328125, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0956488847732544, + "rewards/margins": 10.834636688232422, + "rewards/rejected": -11.930285453796387, + "step": 11710 + }, + { + "epoch": 0.7, + "learning_rate": 1.2585722128254896e-06, + "logits/chosen": -2.6702404022216797, + "logits/rejected": -2.2642807960510254, + "logps/chosen": -164.9624481201172, + "logps/rejected": -1234.2159423828125, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9125876426696777, + "rewards/margins": 10.986547470092773, + "rewards/rejected": -11.899134635925293, + "step": 11720 + }, + { + "epoch": 0.7, + "learning_rate": 1.2540580929831065e-06, + "logits/chosen": -2.6474432945251465, + "logits/rejected": -2.1999526023864746, + "logps/chosen": -158.49305725097656, + "logps/rejected": -1199.693115234375, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8346771001815796, + "rewards/margins": 10.70768928527832, + "rewards/rejected": -11.542366981506348, + "step": 11730 + }, + { + "epoch": 0.7, + "learning_rate": 1.249549371306753e-06, + "logits/chosen": -2.678654193878174, + "logits/rejected": -2.2136974334716797, + "logps/chosen": -185.2541961669922, + "logps/rejected": -1230.862548828125, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.113336205482483, + "rewards/margins": 10.734987258911133, + "rewards/rejected": -11.848322868347168, + "step": 11740 + }, + { + "epoch": 0.7, + "learning_rate": 1.2450460673309115e-06, + "logits/chosen": -2.6717171669006348, + "logits/rejected": -2.2952756881713867, + "logps/chosen": -163.7808380126953, + "logps/rejected": -1220.9493408203125, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9566238522529602, + "rewards/margins": 10.802164077758789, + "rewards/rejected": -11.758787155151367, + "step": 11750 + }, + { + "epoch": 0.7, + "learning_rate": 1.2405482005665894e-06, + "logits/chosen": -2.658249616622925, + "logits/rejected": -2.154463291168213, + "logps/chosen": -178.73484802246094, + "logps/rejected": -1264.4908447265625, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0500277280807495, + "rewards/margins": 11.150541305541992, + "rewards/rejected": -12.200569152832031, + "step": 11760 + }, + { + "epoch": 0.7, + "learning_rate": 1.236055790501238e-06, + "logits/chosen": -2.7080893516540527, + "logits/rejected": -2.1892635822296143, + "logps/chosen": -167.18630981445312, + "logps/rejected": -1232.641845703125, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9356141090393066, + "rewards/margins": 10.950101852416992, + "rewards/rejected": -11.885717391967773, + "step": 11770 + }, + { + "epoch": 0.7, + "learning_rate": 1.231568856598666e-06, + "logits/chosen": -2.7048535346984863, + "logits/rejected": -2.2809369564056396, + "logps/chosen": -169.42620849609375, + "logps/rejected": -1253.554931640625, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.000076174736023, + "rewards/margins": 11.080299377441406, + "rewards/rejected": -12.080375671386719, + "step": 11780 + }, + { + "epoch": 0.7, + "learning_rate": 1.2270874182989566e-06, + "logits/chosen": -2.6488308906555176, + "logits/rejected": -2.161964178085327, + "logps/chosen": -157.1221923828125, + "logps/rejected": -1067.824951171875, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9346389770507812, + "rewards/margins": 9.307622909545898, + "rewards/rejected": -10.24226188659668, + "step": 11790 + }, + { + "epoch": 0.7, + "learning_rate": 1.2226114950183836e-06, + "logits/chosen": -2.6762447357177734, + "logits/rejected": -2.2160847187042236, + "logps/chosen": -166.7312469482422, + "logps/rejected": -1209.5733642578125, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9776681661605835, + "rewards/margins": 10.67459774017334, + "rewards/rejected": -11.652264595031738, + "step": 11800 + }, + { + "epoch": 0.7, + "learning_rate": 1.2181411061493229e-06, + "logits/chosen": -2.6886744499206543, + "logits/rejected": -2.1918797492980957, + "logps/chosen": -164.03114318847656, + "logps/rejected": -1207.5887451171875, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9666979908943176, + "rewards/margins": 10.65786361694336, + "rewards/rejected": -11.624561309814453, + "step": 11810 + }, + { + "epoch": 0.7, + "learning_rate": 1.213676271060178e-06, + "logits/chosen": -2.6682701110839844, + "logits/rejected": -2.210137128829956, + "logps/chosen": -157.25338745117188, + "logps/rejected": -1236.2374267578125, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8700025677680969, + "rewards/margins": 11.043835639953613, + "rewards/rejected": -11.913838386535645, + "step": 11820 + }, + { + "epoch": 0.71, + "learning_rate": 1.2092170090952838e-06, + "logits/chosen": -2.681671380996704, + "logits/rejected": -2.2425122261047363, + "logps/chosen": -185.64088439941406, + "logps/rejected": -1307.0732421875, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1284072399139404, + "rewards/margins": 11.496757507324219, + "rewards/rejected": -12.625164031982422, + "step": 11830 + }, + { + "epoch": 0.71, + "learning_rate": 1.204763339574833e-06, + "logits/chosen": -2.6535892486572266, + "logits/rejected": -2.2164878845214844, + "logps/chosen": -172.20343017578125, + "logps/rejected": -1361.5809326171875, + "loss": 0.0183, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0151000022888184, + "rewards/margins": 12.163667678833008, + "rewards/rejected": -13.178767204284668, + "step": 11840 + }, + { + "epoch": 0.71, + "learning_rate": 1.2003152817947878e-06, + "logits/chosen": -2.667032480239868, + "logits/rejected": -2.200667381286621, + "logps/chosen": -163.82644653320312, + "logps/rejected": -1169.5721435546875, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.92219078540802, + "rewards/margins": 10.342397689819336, + "rewards/rejected": -11.264589309692383, + "step": 11850 + }, + { + "epoch": 0.71, + "learning_rate": 1.1958728550267958e-06, + "logits/chosen": -2.668520927429199, + "logits/rejected": -2.2707881927490234, + "logps/chosen": -161.77818298339844, + "logps/rejected": -1139.8802490234375, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9035550355911255, + "rewards/margins": 10.056806564331055, + "rewards/rejected": -10.96036148071289, + "step": 11860 + }, + { + "epoch": 0.71, + "learning_rate": 1.1914360785181099e-06, + "logits/chosen": -2.6910767555236816, + "logits/rejected": -2.2122721672058105, + "logps/chosen": -174.186767578125, + "logps/rejected": -1055.9832763671875, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.065211534500122, + "rewards/margins": 9.051030158996582, + "rewards/rejected": -10.116240501403809, + "step": 11870 + }, + { + "epoch": 0.71, + "learning_rate": 1.1870049714915e-06, + "logits/chosen": -2.709481716156006, + "logits/rejected": -2.3278183937072754, + "logps/chosen": -181.68829345703125, + "logps/rejected": -1392.965087890625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0728803873062134, + "rewards/margins": 12.423988342285156, + "rewards/rejected": -13.496870040893555, + "step": 11880 + }, + { + "epoch": 0.71, + "learning_rate": 1.182579553145175e-06, + "logits/chosen": -2.6649162769317627, + "logits/rejected": -2.2277672290802, + "logps/chosen": -158.9559326171875, + "logps/rejected": -1329.152099609375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8659127950668335, + "rewards/margins": 11.969205856323242, + "rewards/rejected": -12.835118293762207, + "step": 11890 + }, + { + "epoch": 0.71, + "learning_rate": 1.1781598426526935e-06, + "logits/chosen": -2.6503052711486816, + "logits/rejected": -2.2765793800354004, + "logps/chosen": -164.4527130126953, + "logps/rejected": -1314.384033203125, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0304515361785889, + "rewards/margins": 11.653807640075684, + "rewards/rejected": -12.684259414672852, + "step": 11900 + }, + { + "epoch": 0.71, + "learning_rate": 1.1737458591628898e-06, + "logits/chosen": -2.6715242862701416, + "logits/rejected": -2.2028393745422363, + "logps/chosen": -160.84860229492188, + "logps/rejected": -1242.447509765625, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8587220311164856, + "rewards/margins": 11.128213882446289, + "rewards/rejected": -11.986934661865234, + "step": 11910 + }, + { + "epoch": 0.71, + "learning_rate": 1.1693376217997795e-06, + "logits/chosen": -2.678063154220581, + "logits/rejected": -2.161362409591675, + "logps/chosen": -193.4080352783203, + "logps/rejected": -1244.3980712890625, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2295480966567993, + "rewards/margins": 10.776708602905273, + "rewards/rejected": -12.006256103515625, + "step": 11920 + }, + { + "epoch": 0.71, + "learning_rate": 1.164935149662485e-06, + "logits/chosen": -2.644008159637451, + "logits/rejected": -2.2190909385681152, + "logps/chosen": -179.3295440673828, + "logps/rejected": -1232.16015625, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1008204221725464, + "rewards/margins": 10.773138999938965, + "rewards/rejected": -11.873958587646484, + "step": 11930 + }, + { + "epoch": 0.71, + "learning_rate": 1.1605384618251533e-06, + "logits/chosen": -2.699425220489502, + "logits/rejected": -2.18827486038208, + "logps/chosen": -170.82801818847656, + "logps/rejected": -1274.0860595703125, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9689435958862305, + "rewards/margins": 11.329154014587402, + "rewards/rejected": -12.298096656799316, + "step": 11940 + }, + { + "epoch": 0.71, + "learning_rate": 1.156147577336865e-06, + "logits/chosen": -2.678511142730713, + "logits/rejected": -2.2204604148864746, + "logps/chosen": -176.87564086914062, + "logps/rejected": -1340.9168701171875, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1032963991165161, + "rewards/margins": 11.873059272766113, + "rewards/rejected": -12.976354598999023, + "step": 11950 + }, + { + "epoch": 0.71, + "learning_rate": 1.1517625152215603e-06, + "logits/chosen": -2.673372983932495, + "logits/rejected": -2.216002941131592, + "logps/chosen": -192.49183654785156, + "logps/rejected": -1290.7626953125, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1558568477630615, + "rewards/margins": 11.306575775146484, + "rewards/rejected": -12.462432861328125, + "step": 11960 + }, + { + "epoch": 0.71, + "learning_rate": 1.1473832944779525e-06, + "logits/chosen": -2.6785974502563477, + "logits/rejected": -2.2316765785217285, + "logps/chosen": -190.4306182861328, + "logps/rejected": -1287.8900146484375, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2031124830245972, + "rewards/margins": 11.224150657653809, + "rewards/rejected": -12.427265167236328, + "step": 11970 + }, + { + "epoch": 0.71, + "learning_rate": 1.1430099340794482e-06, + "logits/chosen": -2.65739107131958, + "logits/rejected": -2.273519992828369, + "logps/chosen": -191.32949829101562, + "logps/rejected": -1294.4039306640625, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2415682077407837, + "rewards/margins": 11.256373405456543, + "rewards/rejected": -12.497941970825195, + "step": 11980 + }, + { + "epoch": 0.71, + "learning_rate": 1.138642452974059e-06, + "logits/chosen": -2.67218017578125, + "logits/rejected": -2.2821950912475586, + "logps/chosen": -177.0503692626953, + "logps/rejected": -1250.2603759765625, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.095555067062378, + "rewards/margins": 10.965291976928711, + "rewards/rejected": -12.060847282409668, + "step": 11990 + }, + { + "epoch": 0.72, + "learning_rate": 1.1342808700843297e-06, + "logits/chosen": -2.6717300415039062, + "logits/rejected": -2.182730197906494, + "logps/chosen": -184.40335083007812, + "logps/rejected": -1261.567138671875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1220561265945435, + "rewards/margins": 11.054956436157227, + "rewards/rejected": -12.17701244354248, + "step": 12000 + }, + { + "epoch": 0.72, + "learning_rate": 1.1299252043072478e-06, + "logits/chosen": -2.6794955730438232, + "logits/rejected": -2.216703414916992, + "logps/chosen": -174.71340942382812, + "logps/rejected": -1267.625244140625, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.072036862373352, + "rewards/margins": 11.160558700561523, + "rewards/rejected": -12.232595443725586, + "step": 12010 + }, + { + "epoch": 0.72, + "learning_rate": 1.1255754745141617e-06, + "logits/chosen": -2.6612839698791504, + "logits/rejected": -2.2987060546875, + "logps/chosen": -193.7748565673828, + "logps/rejected": -1137.1163330078125, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2622792720794678, + "rewards/margins": 9.6884765625, + "rewards/rejected": -10.950756072998047, + "step": 12020 + }, + { + "epoch": 0.72, + "learning_rate": 1.1212316995507079e-06, + "logits/chosen": -2.692817211151123, + "logits/rejected": -2.246981143951416, + "logps/chosen": -203.880859375, + "logps/rejected": -1277.25, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2494544982910156, + "rewards/margins": 11.091383934020996, + "rewards/rejected": -12.340837478637695, + "step": 12030 + }, + { + "epoch": 0.72, + "learning_rate": 1.1168938982367162e-06, + "logits/chosen": -2.6536331176757812, + "logits/rejected": -2.2482407093048096, + "logps/chosen": -173.14845275878906, + "logps/rejected": -1227.80517578125, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0356905460357666, + "rewards/margins": 10.808980941772461, + "rewards/rejected": -11.844671249389648, + "step": 12040 + }, + { + "epoch": 0.72, + "learning_rate": 1.112562089366139e-06, + "logits/chosen": -2.6278417110443115, + "logits/rejected": -2.1871559619903564, + "logps/chosen": -211.38430786132812, + "logps/rejected": -1264.448486328125, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3910199403762817, + "rewards/margins": 10.821457862854004, + "rewards/rejected": -12.212477684020996, + "step": 12050 + }, + { + "epoch": 0.72, + "learning_rate": 1.108236291706965e-06, + "logits/chosen": -2.645273447036743, + "logits/rejected": -2.227074384689331, + "logps/chosen": -195.989501953125, + "logps/rejected": -1285.885498046875, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.260510802268982, + "rewards/margins": 11.161626815795898, + "rewards/rejected": -12.422136306762695, + "step": 12060 + }, + { + "epoch": 0.72, + "learning_rate": 1.1039165240011388e-06, + "logits/chosen": -2.681525230407715, + "logits/rejected": -2.178351640701294, + "logps/chosen": -190.31344604492188, + "logps/rejected": -1220.031982421875, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1926311254501343, + "rewards/margins": 10.562582015991211, + "rewards/rejected": -11.755213737487793, + "step": 12070 + }, + { + "epoch": 0.72, + "learning_rate": 1.0996028049644792e-06, + "logits/chosen": -2.662846326828003, + "logits/rejected": -2.2142162322998047, + "logps/chosen": -214.1508331298828, + "logps/rejected": -1111.049072265625, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4760183095932007, + "rewards/margins": 9.213679313659668, + "rewards/rejected": -10.689699172973633, + "step": 12080 + }, + { + "epoch": 0.72, + "learning_rate": 1.095295153286599e-06, + "logits/chosen": -2.6866848468780518, + "logits/rejected": -2.2753748893737793, + "logps/chosen": -183.940185546875, + "logps/rejected": -1207.918212890625, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1320769786834717, + "rewards/margins": 10.512657165527344, + "rewards/rejected": -11.644734382629395, + "step": 12090 + }, + { + "epoch": 0.72, + "learning_rate": 1.090993587630824e-06, + "logits/chosen": -2.6410410404205322, + "logits/rejected": -2.1447455883026123, + "logps/chosen": -217.2635498046875, + "logps/rejected": -1309.773681640625, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.434149980545044, + "rewards/margins": 11.219610214233398, + "rewards/rejected": -12.653759956359863, + "step": 12100 + }, + { + "epoch": 0.72, + "learning_rate": 1.0866981266341084e-06, + "logits/chosen": -2.7037551403045654, + "logits/rejected": -2.225490093231201, + "logps/chosen": -206.97787475585938, + "logps/rejected": -1398.537109375, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2295153141021729, + "rewards/margins": 12.304937362670898, + "rewards/rejected": -13.534452438354492, + "step": 12110 + }, + { + "epoch": 0.72, + "learning_rate": 1.082408788906964e-06, + "logits/chosen": -2.627535581588745, + "logits/rejected": -2.1925041675567627, + "logps/chosen": -175.8035125732422, + "logps/rejected": -1310.95458984375, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.082283616065979, + "rewards/margins": 11.59314250946045, + "rewards/rejected": -12.67542552947998, + "step": 12120 + }, + { + "epoch": 0.72, + "learning_rate": 1.078125593033366e-06, + "logits/chosen": -2.6692299842834473, + "logits/rejected": -2.1882576942443848, + "logps/chosen": -199.92593383789062, + "logps/rejected": -1419.90625, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2494189739227295, + "rewards/margins": 12.493829727172852, + "rewards/rejected": -13.743250846862793, + "step": 12130 + }, + { + "epoch": 0.72, + "learning_rate": 1.0738485575706834e-06, + "logits/chosen": -2.676517963409424, + "logits/rejected": -2.2814550399780273, + "logps/chosen": -197.37060546875, + "logps/rejected": -1307.5328369140625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.236182689666748, + "rewards/margins": 11.39842414855957, + "rewards/rejected": -12.63460636138916, + "step": 12140 + }, + { + "epoch": 0.72, + "learning_rate": 1.0695777010495936e-06, + "logits/chosen": -2.6884961128234863, + "logits/rejected": -2.252476930618286, + "logps/chosen": -201.0217742919922, + "logps/rejected": -1290.505126953125, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2830235958099365, + "rewards/margins": 11.179656028747559, + "rewards/rejected": -12.462679862976074, + "step": 12150 + }, + { + "epoch": 0.73, + "learning_rate": 1.065313041974003e-06, + "logits/chosen": -2.639547824859619, + "logits/rejected": -2.2105681896209717, + "logps/chosen": -181.94541931152344, + "logps/rejected": -1448.171142578125, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.108457326889038, + "rewards/margins": 12.92326545715332, + "rewards/rejected": -14.031723022460938, + "step": 12160 + }, + { + "epoch": 0.73, + "learning_rate": 1.0610545988209671e-06, + "logits/chosen": -2.7427210807800293, + "logits/rejected": -2.293287992477417, + "logps/chosen": -202.73580932617188, + "logps/rejected": -1285.069580078125, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3460676670074463, + "rewards/margins": 11.057480812072754, + "rewards/rejected": -12.403548240661621, + "step": 12170 + }, + { + "epoch": 0.73, + "learning_rate": 1.0568023900406108e-06, + "logits/chosen": -2.691849708557129, + "logits/rejected": -2.2438807487487793, + "logps/chosen": -186.82894897460938, + "logps/rejected": -1246.1009521484375, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0914620161056519, + "rewards/margins": 10.9232177734375, + "rewards/rejected": -12.014680862426758, + "step": 12180 + }, + { + "epoch": 0.73, + "learning_rate": 1.0525564340560476e-06, + "logits/chosen": -2.6386990547180176, + "logits/rejected": -2.1966171264648438, + "logps/chosen": -185.1238250732422, + "logps/rejected": -1255.1148681640625, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2038465738296509, + "rewards/margins": 10.910414695739746, + "rewards/rejected": -12.114261627197266, + "step": 12190 + }, + { + "epoch": 0.73, + "learning_rate": 1.048316749263298e-06, + "logits/chosen": -2.6191465854644775, + "logits/rejected": -2.2068686485290527, + "logps/chosen": -237.9846649169922, + "logps/rejected": -1279.3477783203125, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7072252035140991, + "rewards/margins": 10.642881393432617, + "rewards/rejected": -12.350106239318848, + "step": 12200 + }, + { + "epoch": 0.73, + "learning_rate": 1.044083354031217e-06, + "logits/chosen": -2.6836929321289062, + "logits/rejected": -2.289625406265259, + "logps/chosen": -184.58509826660156, + "logps/rejected": -1135.781982421875, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1118168830871582, + "rewards/margins": 9.80505657196045, + "rewards/rejected": -10.916872024536133, + "step": 12210 + }, + { + "epoch": 0.73, + "learning_rate": 1.039856266701404e-06, + "logits/chosen": -2.6601803302764893, + "logits/rejected": -2.189298152923584, + "logps/chosen": -190.9080352783203, + "logps/rejected": -1322.838623046875, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1929147243499756, + "rewards/margins": 11.594941139221191, + "rewards/rejected": -12.78785514831543, + "step": 12220 + }, + { + "epoch": 0.73, + "learning_rate": 1.035635505588132e-06, + "logits/chosen": -2.6691157817840576, + "logits/rejected": -2.235931873321533, + "logps/chosen": -166.89657592773438, + "logps/rejected": -1178.704833984375, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0135897397994995, + "rewards/margins": 10.321293830871582, + "rewards/rejected": -11.334881782531738, + "step": 12230 + }, + { + "epoch": 0.73, + "learning_rate": 1.0314210889782642e-06, + "logits/chosen": -2.6448354721069336, + "logits/rejected": -2.2196128368377686, + "logps/chosen": -188.511474609375, + "logps/rejected": -1234.0687255859375, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2140707969665527, + "rewards/margins": 10.684258460998535, + "rewards/rejected": -11.898329734802246, + "step": 12240 + }, + { + "epoch": 0.73, + "learning_rate": 1.0272130351311758e-06, + "logits/chosen": -2.6663403511047363, + "logits/rejected": -2.2466037273406982, + "logps/chosen": -189.37844848632812, + "logps/rejected": -1187.046142578125, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1639484167099, + "rewards/margins": 10.26215648651123, + "rewards/rejected": -11.426103591918945, + "step": 12250 + }, + { + "epoch": 0.73, + "learning_rate": 1.0230113622786744e-06, + "logits/chosen": -2.6671876907348633, + "logits/rejected": -2.3231263160705566, + "logps/chosen": -179.62448120117188, + "logps/rejected": -1230.8353271484375, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0858711004257202, + "rewards/margins": 10.794492721557617, + "rewards/rejected": -11.880363464355469, + "step": 12260 + }, + { + "epoch": 0.73, + "learning_rate": 1.0188160886249219e-06, + "logits/chosen": -2.6754186153411865, + "logits/rejected": -2.2237744331359863, + "logps/chosen": -164.10980224609375, + "logps/rejected": -1253.9830322265625, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9184304475784302, + "rewards/margins": 11.18305778503418, + "rewards/rejected": -12.101489067077637, + "step": 12270 + }, + { + "epoch": 0.73, + "learning_rate": 1.0146272323463548e-06, + "logits/chosen": -2.6572108268737793, + "logits/rejected": -2.210970640182495, + "logps/chosen": -185.42538452148438, + "logps/rejected": -1337.664306640625, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1407896280288696, + "rewards/margins": 11.784409523010254, + "rewards/rejected": -12.925198554992676, + "step": 12280 + }, + { + "epoch": 0.73, + "learning_rate": 1.0104448115916035e-06, + "logits/chosen": -2.671604871749878, + "logits/rejected": -2.2632572650909424, + "logps/chosen": -143.889892578125, + "logps/rejected": -1244.36474609375, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7241929769515991, + "rewards/margins": 11.282548904418945, + "rewards/rejected": -12.006742477416992, + "step": 12290 + }, + { + "epoch": 0.73, + "learning_rate": 1.0062688444814208e-06, + "logits/chosen": -2.6593387126922607, + "logits/rejected": -2.259366512298584, + "logps/chosen": -183.2617950439453, + "logps/rejected": -1187.4873046875, + "loss": 0.0126, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.1031699180603027, + "rewards/margins": 10.335807800292969, + "rewards/rejected": -11.438979148864746, + "step": 12300 + }, + { + "epoch": 0.73, + "learning_rate": 1.0020993491085936e-06, + "logits/chosen": -2.671769618988037, + "logits/rejected": -2.2467968463897705, + "logps/chosen": -157.1591339111328, + "logps/rejected": -1172.5406494140625, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8475213050842285, + "rewards/margins": 10.435922622680664, + "rewards/rejected": -11.283443450927734, + "step": 12310 + }, + { + "epoch": 0.73, + "learning_rate": 9.979363435378717e-07, + "logits/chosen": -2.658071994781494, + "logits/rejected": -2.277698278427124, + "logps/chosen": -155.0120849609375, + "logps/rejected": -1244.922119140625, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8463680148124695, + "rewards/margins": 11.151788711547852, + "rewards/rejected": -11.99815559387207, + "step": 12320 + }, + { + "epoch": 0.74, + "learning_rate": 9.937798458058864e-07, + "logits/chosen": -2.611347198486328, + "logits/rejected": -2.1982204914093018, + "logps/chosen": -152.5498504638672, + "logps/rejected": -1221.0391845703125, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8382118344306946, + "rewards/margins": 10.927261352539062, + "rewards/rejected": -11.765473365783691, + "step": 12330 + }, + { + "epoch": 0.74, + "learning_rate": 9.896298739210745e-07, + "logits/chosen": -2.6760573387145996, + "logits/rejected": -2.262755870819092, + "logps/chosen": -161.95468139648438, + "logps/rejected": -1308.771240234375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9251921772956848, + "rewards/margins": 11.71369743347168, + "rewards/rejected": -12.638891220092773, + "step": 12340 + }, + { + "epoch": 0.74, + "learning_rate": 9.85486445863597e-07, + "logits/chosen": -2.6713201999664307, + "logits/rejected": -2.2018585205078125, + "logps/chosen": -145.06082153320312, + "logps/rejected": -1328.052001953125, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8253404498100281, + "rewards/margins": 12.014080047607422, + "rewards/rejected": -12.8394193649292, + "step": 12350 + }, + { + "epoch": 0.74, + "learning_rate": 9.813495795852646e-07, + "logits/chosen": -2.6409125328063965, + "logits/rejected": -2.204023838043213, + "logps/chosen": -168.88502502441406, + "logps/rejected": -1289.7230224609375, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0140244960784912, + "rewards/margins": 11.437627792358398, + "rewards/rejected": -12.451652526855469, + "step": 12360 + }, + { + "epoch": 0.74, + "learning_rate": 9.772192930094588e-07, + "logits/chosen": -2.637007474899292, + "logits/rejected": -2.121833324432373, + "logps/chosen": -152.52601623535156, + "logps/rejected": -1168.227294921875, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8251526951789856, + "rewards/margins": 10.420989990234375, + "rewards/rejected": -11.246142387390137, + "step": 12370 + }, + { + "epoch": 0.74, + "learning_rate": 9.730956040310499e-07, + "logits/chosen": -2.691771984100342, + "logits/rejected": -2.2512869834899902, + "logps/chosen": -174.83450317382812, + "logps/rejected": -1134.830078125, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.024186611175537, + "rewards/margins": 9.885404586791992, + "rewards/rejected": -10.909590721130371, + "step": 12380 + }, + { + "epoch": 0.74, + "learning_rate": 9.689785305163307e-07, + "logits/chosen": -2.664811372756958, + "logits/rejected": -2.2151479721069336, + "logps/chosen": -145.58462524414062, + "logps/rejected": -1182.7923583984375, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7275064587593079, + "rewards/margins": 10.661162376403809, + "rewards/rejected": -11.388669967651367, + "step": 12390 + }, + { + "epoch": 0.74, + "learning_rate": 9.648680903029245e-07, + "logits/chosen": -2.6662790775299072, + "logits/rejected": -2.287348985671997, + "logps/chosen": -143.27102661132812, + "logps/rejected": -1194.017333984375, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7125800251960754, + "rewards/margins": 10.792291641235352, + "rewards/rejected": -11.50487232208252, + "step": 12400 + }, + { + "epoch": 0.74, + "learning_rate": 9.607643011997195e-07, + "logits/chosen": -2.6860768795013428, + "logits/rejected": -2.2764711380004883, + "logps/chosen": -173.8489532470703, + "logps/rejected": -1296.558837890625, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0201029777526855, + "rewards/margins": 11.49165153503418, + "rewards/rejected": -12.511754035949707, + "step": 12410 + }, + { + "epoch": 0.74, + "learning_rate": 9.566671809867864e-07, + "logits/chosen": -2.6700212955474854, + "logits/rejected": -2.2919955253601074, + "logps/chosen": -155.58798217773438, + "logps/rejected": -1162.3597412109375, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8391944766044617, + "rewards/margins": 10.340049743652344, + "rewards/rejected": -11.179243087768555, + "step": 12420 + }, + { + "epoch": 0.74, + "learning_rate": 9.52576747415302e-07, + "logits/chosen": -2.665966749191284, + "logits/rejected": -2.326286792755127, + "logps/chosen": -150.83148193359375, + "logps/rejected": -1322.6256103515625, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7846275568008423, + "rewards/margins": 11.986906051635742, + "rewards/rejected": -12.77153491973877, + "step": 12430 + }, + { + "epoch": 0.74, + "learning_rate": 9.484930182074722e-07, + "logits/chosen": -2.657564640045166, + "logits/rejected": -2.22395920753479, + "logps/chosen": -168.1222686767578, + "logps/rejected": -1227.8724365234375, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9654632806777954, + "rewards/margins": 10.863714218139648, + "rewards/rejected": -11.829176902770996, + "step": 12440 + }, + { + "epoch": 0.74, + "learning_rate": 9.444160110564563e-07, + "logits/chosen": -2.677396297454834, + "logits/rejected": -2.18302321434021, + "logps/chosen": -149.75918579101562, + "logps/rejected": -1145.447509765625, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7426698207855225, + "rewards/margins": 10.266037940979004, + "rewards/rejected": -11.008707046508789, + "step": 12450 + }, + { + "epoch": 0.74, + "learning_rate": 9.403457436262906e-07, + "logits/chosen": -2.654158592224121, + "logits/rejected": -2.240130662918091, + "logps/chosen": -183.4116668701172, + "logps/rejected": -1234.705078125, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1435333490371704, + "rewards/margins": 10.754674911499023, + "rewards/rejected": -11.898208618164062, + "step": 12460 + }, + { + "epoch": 0.74, + "learning_rate": 9.362822335518062e-07, + "logits/chosen": -2.620704174041748, + "logits/rejected": -2.1191720962524414, + "logps/chosen": -181.8234100341797, + "logps/rejected": -1332.8526611328125, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0477808713912964, + "rewards/margins": 11.827988624572754, + "rewards/rejected": -12.875768661499023, + "step": 12470 + }, + { + "epoch": 0.74, + "learning_rate": 9.322254984385651e-07, + "logits/chosen": -2.674955368041992, + "logits/rejected": -2.2813265323638916, + "logps/chosen": -139.9225616455078, + "logps/rejected": -1191.1591796875, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7620293498039246, + "rewards/margins": 10.708468437194824, + "rewards/rejected": -11.470499038696289, + "step": 12480 + }, + { + "epoch": 0.74, + "learning_rate": 9.281755558627686e-07, + "logits/chosen": -2.6602911949157715, + "logits/rejected": -2.1803629398345947, + "logps/chosen": -158.12271118164062, + "logps/rejected": -1334.6392822265625, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8359683752059937, + "rewards/margins": 12.070714950561523, + "rewards/rejected": -12.906682968139648, + "step": 12490 + }, + { + "epoch": 0.75, + "learning_rate": 9.241324233711929e-07, + "logits/chosen": -2.6210074424743652, + "logits/rejected": -2.2206318378448486, + "logps/chosen": -152.96688842773438, + "logps/rejected": -1248.0006103515625, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8983484506607056, + "rewards/margins": 11.138702392578125, + "rewards/rejected": -12.037050247192383, + "step": 12500 + }, + { + "epoch": 0.75, + "learning_rate": 9.200961184811075e-07, + "logits/chosen": -2.6159567832946777, + "logits/rejected": -2.2066009044647217, + "logps/chosen": -181.7709197998047, + "logps/rejected": -1248.8392333984375, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0818769931793213, + "rewards/margins": 10.952686309814453, + "rewards/rejected": -12.034563064575195, + "step": 12510 + }, + { + "epoch": 0.75, + "learning_rate": 9.160666586802011e-07, + "logits/chosen": -2.6807777881622314, + "logits/rejected": -2.228256940841675, + "logps/chosen": -150.993408203125, + "logps/rejected": -1311.835205078125, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.825247585773468, + "rewards/margins": 11.840875625610352, + "rewards/rejected": -12.66612434387207, + "step": 12520 + }, + { + "epoch": 0.75, + "learning_rate": 9.12044061426505e-07, + "logits/chosen": -2.685821771621704, + "logits/rejected": -2.308408260345459, + "logps/chosen": -152.82186889648438, + "logps/rejected": -1244.049560546875, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8500364422798157, + "rewards/margins": 11.15960693359375, + "rewards/rejected": -12.009645462036133, + "step": 12530 + }, + { + "epoch": 0.75, + "learning_rate": 9.080283441483182e-07, + "logits/chosen": -2.6817920207977295, + "logits/rejected": -2.2532148361206055, + "logps/chosen": -155.13894653320312, + "logps/rejected": -1286.9625244140625, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8206734657287598, + "rewards/margins": 11.612648963928223, + "rewards/rejected": -12.433321952819824, + "step": 12540 + }, + { + "epoch": 0.75, + "learning_rate": 9.040195242441322e-07, + "logits/chosen": -2.658567428588867, + "logits/rejected": -2.1805081367492676, + "logps/chosen": -150.51686096191406, + "logps/rejected": -1141.17724609375, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8117367029190063, + "rewards/margins": 10.165749549865723, + "rewards/rejected": -10.977485656738281, + "step": 12550 + }, + { + "epoch": 0.75, + "learning_rate": 9.000176190825513e-07, + "logits/chosen": -2.6792659759521484, + "logits/rejected": -2.2553915977478027, + "logps/chosen": -167.29043579101562, + "logps/rejected": -1228.302978515625, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8949813842773438, + "rewards/margins": 10.941363334655762, + "rewards/rejected": -11.836343765258789, + "step": 12560 + }, + { + "epoch": 0.75, + "learning_rate": 8.960226460022272e-07, + "logits/chosen": -2.695333957672119, + "logits/rejected": -2.1948001384735107, + "logps/chosen": -174.39013671875, + "logps/rejected": -1158.2430419921875, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0396320819854736, + "rewards/margins": 10.100898742675781, + "rewards/rejected": -11.140531539916992, + "step": 12570 + }, + { + "epoch": 0.75, + "learning_rate": 8.920346223117721e-07, + "logits/chosen": -2.6415774822235107, + "logits/rejected": -2.201040506362915, + "logps/chosen": -153.525146484375, + "logps/rejected": -1195.8531494140625, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8580840826034546, + "rewards/margins": 10.666096687316895, + "rewards/rejected": -11.524181365966797, + "step": 12580 + }, + { + "epoch": 0.75, + "learning_rate": 8.88053565289691e-07, + "logits/chosen": -2.685497760772705, + "logits/rejected": -2.2530007362365723, + "logps/chosen": -166.4014434814453, + "logps/rejected": -1194.649169921875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9929316639900208, + "rewards/margins": 10.518499374389648, + "rewards/rejected": -11.511429786682129, + "step": 12590 + }, + { + "epoch": 0.75, + "learning_rate": 8.840794921843085e-07, + "logits/chosen": -2.624795913696289, + "logits/rejected": -2.2480812072753906, + "logps/chosen": -155.05343627929688, + "logps/rejected": -1237.7613525390625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9196068048477173, + "rewards/margins": 11.013126373291016, + "rewards/rejected": -11.932733535766602, + "step": 12600 + }, + { + "epoch": 0.75, + "learning_rate": 8.801124202136846e-07, + "logits/chosen": -2.701706647872925, + "logits/rejected": -2.2729063034057617, + "logps/chosen": -159.85902404785156, + "logps/rejected": -1218.8260498046875, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8510640263557434, + "rewards/margins": 10.89560604095459, + "rewards/rejected": -11.74666976928711, + "step": 12610 + }, + { + "epoch": 0.75, + "learning_rate": 8.761523665655508e-07, + "logits/chosen": -2.652740955352783, + "logits/rejected": -2.2721943855285645, + "logps/chosen": -161.44705200195312, + "logps/rejected": -1173.760498046875, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9070202112197876, + "rewards/margins": 10.377522468566895, + "rewards/rejected": -11.28454303741455, + "step": 12620 + }, + { + "epoch": 0.75, + "learning_rate": 8.721993483972294e-07, + "logits/chosen": -2.616049289703369, + "logits/rejected": -2.278193950653076, + "logps/chosen": -147.48715209960938, + "logps/rejected": -1177.179931640625, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.805441677570343, + "rewards/margins": 10.516280174255371, + "rewards/rejected": -11.321722984313965, + "step": 12630 + }, + { + "epoch": 0.75, + "learning_rate": 8.682533828355616e-07, + "logits/chosen": -2.681006908416748, + "logits/rejected": -2.210326671600342, + "logps/chosen": -158.94662475585938, + "logps/rejected": -1211.8001708984375, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8334416151046753, + "rewards/margins": 10.846317291259766, + "rewards/rejected": -11.67975902557373, + "step": 12640 + }, + { + "epoch": 0.75, + "learning_rate": 8.643144869768294e-07, + "logits/chosen": -2.708228588104248, + "logits/rejected": -2.373368740081787, + "logps/chosen": -189.62631225585938, + "logps/rejected": -1233.6568603515625, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1873180866241455, + "rewards/margins": 10.70081615447998, + "rewards/rejected": -11.88813591003418, + "step": 12650 + }, + { + "epoch": 0.75, + "learning_rate": 8.6038267788669e-07, + "logits/chosen": -2.6643872261047363, + "logits/rejected": -2.256521701812744, + "logps/chosen": -161.9615936279297, + "logps/rejected": -1239.02880859375, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.860692024230957, + "rewards/margins": 11.08536434173584, + "rewards/rejected": -11.94605541229248, + "step": 12660 + }, + { + "epoch": 0.76, + "learning_rate": 8.56457972600093e-07, + "logits/chosen": -2.6932599544525146, + "logits/rejected": -2.326667308807373, + "logps/chosen": -141.00509643554688, + "logps/rejected": -1195.75732421875, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7428602576255798, + "rewards/margins": 10.78742790222168, + "rewards/rejected": -11.530289649963379, + "step": 12670 + }, + { + "epoch": 0.76, + "learning_rate": 8.525403881212083e-07, + "logits/chosen": -2.688525438308716, + "logits/rejected": -2.2554612159729004, + "logps/chosen": -153.75303649902344, + "logps/rejected": -1194.4525146484375, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8197674751281738, + "rewards/margins": 10.654682159423828, + "rewards/rejected": -11.474449157714844, + "step": 12680 + }, + { + "epoch": 0.76, + "learning_rate": 8.486299414233598e-07, + "logits/chosen": -2.683134078979492, + "logits/rejected": -2.237874984741211, + "logps/chosen": -188.1017608642578, + "logps/rejected": -1218.7811279296875, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2083886861801147, + "rewards/margins": 10.539608001708984, + "rewards/rejected": -11.74799633026123, + "step": 12690 + }, + { + "epoch": 0.76, + "learning_rate": 8.447266494489408e-07, + "logits/chosen": -2.6903889179229736, + "logits/rejected": -2.251967668533325, + "logps/chosen": -163.27590942382812, + "logps/rejected": -1198.499755859375, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9330371618270874, + "rewards/margins": 10.607423782348633, + "rewards/rejected": -11.540462493896484, + "step": 12700 + }, + { + "epoch": 0.76, + "learning_rate": 8.408305291093488e-07, + "logits/chosen": -2.643890857696533, + "logits/rejected": -2.2460498809814453, + "logps/chosen": -176.0294647216797, + "logps/rejected": -1271.7115478515625, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0676785707473755, + "rewards/margins": 11.218488693237305, + "rewards/rejected": -12.286166191101074, + "step": 12710 + }, + { + "epoch": 0.76, + "learning_rate": 8.369415972849087e-07, + "logits/chosen": -2.6614792346954346, + "logits/rejected": -2.2340197563171387, + "logps/chosen": -164.5394744873047, + "logps/rejected": -1240.233642578125, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8873366117477417, + "rewards/margins": 11.073686599731445, + "rewards/rejected": -11.961023330688477, + "step": 12720 + }, + { + "epoch": 0.76, + "learning_rate": 8.330598708248011e-07, + "logits/chosen": -2.685056209564209, + "logits/rejected": -2.2430357933044434, + "logps/chosen": -182.60873413085938, + "logps/rejected": -1230.0767822265625, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.116711139678955, + "rewards/margins": 10.743387222290039, + "rewards/rejected": -11.860098838806152, + "step": 12730 + }, + { + "epoch": 0.76, + "learning_rate": 8.291853665469887e-07, + "logits/chosen": -2.719602584838867, + "logits/rejected": -2.321622133255005, + "logps/chosen": -181.74244689941406, + "logps/rejected": -1205.73193359375, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1487635374069214, + "rewards/margins": 10.465904235839844, + "rewards/rejected": -11.614667892456055, + "step": 12740 + }, + { + "epoch": 0.76, + "learning_rate": 8.253181012381409e-07, + "logits/chosen": -2.6575121879577637, + "logits/rejected": -2.2720494270324707, + "logps/chosen": -195.53575134277344, + "logps/rejected": -1303.7784423828125, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3223384618759155, + "rewards/margins": 11.268315315246582, + "rewards/rejected": -12.590653419494629, + "step": 12750 + }, + { + "epoch": 0.76, + "learning_rate": 8.214580916535683e-07, + "logits/chosen": -2.6711089611053467, + "logits/rejected": -2.2366392612457275, + "logps/chosen": -180.1494598388672, + "logps/rejected": -1362.3228759765625, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1187859773635864, + "rewards/margins": 12.059918403625488, + "rewards/rejected": -13.178703308105469, + "step": 12760 + }, + { + "epoch": 0.76, + "learning_rate": 8.176053545171403e-07, + "logits/chosen": -2.6228363513946533, + "logits/rejected": -2.222689628601074, + "logps/chosen": -161.51174926757812, + "logps/rejected": -1225.2271728515625, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8777238130569458, + "rewards/margins": 10.940903663635254, + "rewards/rejected": -11.818626403808594, + "step": 12770 + }, + { + "epoch": 0.76, + "learning_rate": 8.13759906521221e-07, + "logits/chosen": -2.6365768909454346, + "logits/rejected": -2.195507764816284, + "logps/chosen": -191.83201599121094, + "logps/rejected": -1180.65625, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.217861294746399, + "rewards/margins": 10.143416404724121, + "rewards/rejected": -11.361279487609863, + "step": 12780 + }, + { + "epoch": 0.76, + "learning_rate": 8.099217643265928e-07, + "logits/chosen": -2.6504197120666504, + "logits/rejected": -2.1798901557922363, + "logps/chosen": -198.8717498779297, + "logps/rejected": -1325.856201171875, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2890946865081787, + "rewards/margins": 11.52834701538086, + "rewards/rejected": -12.8174409866333, + "step": 12790 + }, + { + "epoch": 0.76, + "learning_rate": 8.06090944562385e-07, + "logits/chosen": -2.6366419792175293, + "logits/rejected": -2.1820199489593506, + "logps/chosen": -180.94772338867188, + "logps/rejected": -1316.943115234375, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1441292762756348, + "rewards/margins": 11.587011337280273, + "rewards/rejected": -12.731141090393066, + "step": 12800 + }, + { + "epoch": 0.76, + "learning_rate": 8.022674638259995e-07, + "logits/chosen": -2.687102794647217, + "logits/rejected": -2.200025796890259, + "logps/chosen": -195.7443084716797, + "logps/rejected": -1190.1416015625, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2137089967727661, + "rewards/margins": 10.235818862915039, + "rewards/rejected": -11.449528694152832, + "step": 12810 + }, + { + "epoch": 0.76, + "learning_rate": 7.984513386830453e-07, + "logits/chosen": -2.696150302886963, + "logits/rejected": -2.3030591011047363, + "logps/chosen": -189.81515502929688, + "logps/rejected": -1216.6669921875, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2418229579925537, + "rewards/margins": 10.495587348937988, + "rewards/rejected": -11.737411499023438, + "step": 12820 + }, + { + "epoch": 0.77, + "learning_rate": 7.94642585667261e-07, + "logits/chosen": -2.7001521587371826, + "logits/rejected": -2.1935391426086426, + "logps/chosen": -200.95327758789062, + "logps/rejected": -1306.6044921875, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2740213871002197, + "rewards/margins": 11.338061332702637, + "rewards/rejected": -12.612082481384277, + "step": 12830 + }, + { + "epoch": 0.77, + "learning_rate": 7.908412212804414e-07, + "logits/chosen": -2.64448881149292, + "logits/rejected": -2.1777243614196777, + "logps/chosen": -201.76072692871094, + "logps/rejected": -1281.703125, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3328297138214111, + "rewards/margins": 11.045312881469727, + "rewards/rejected": -12.378142356872559, + "step": 12840 + }, + { + "epoch": 0.77, + "learning_rate": 7.870472619923755e-07, + "logits/chosen": -2.6775288581848145, + "logits/rejected": -2.235886335372925, + "logps/chosen": -160.6778564453125, + "logps/rejected": -1253.2672119140625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9192790985107422, + "rewards/margins": 11.16749382019043, + "rewards/rejected": -12.086771965026855, + "step": 12850 + }, + { + "epoch": 0.77, + "learning_rate": 7.832607242407631e-07, + "logits/chosen": -2.650095224380493, + "logits/rejected": -2.2364020347595215, + "logps/chosen": -183.8063507080078, + "logps/rejected": -1209.2640380859375, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0781023502349854, + "rewards/margins": 10.567517280578613, + "rewards/rejected": -11.64561939239502, + "step": 12860 + }, + { + "epoch": 0.77, + "learning_rate": 7.794816244311526e-07, + "logits/chosen": -2.6251273155212402, + "logits/rejected": -2.2273685932159424, + "logps/chosen": -173.90574645996094, + "logps/rejected": -1335.7841796875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0581132173538208, + "rewards/margins": 11.862656593322754, + "rewards/rejected": -12.920770645141602, + "step": 12870 + }, + { + "epoch": 0.77, + "learning_rate": 7.757099789368663e-07, + "logits/chosen": -2.6846981048583984, + "logits/rejected": -2.2969648838043213, + "logps/chosen": -157.6529998779297, + "logps/rejected": -1178.612060546875, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8531044125556946, + "rewards/margins": 10.486659049987793, + "rewards/rejected": -11.339765548706055, + "step": 12880 + }, + { + "epoch": 0.77, + "learning_rate": 7.7194580409893e-07, + "logits/chosen": -2.627392292022705, + "logits/rejected": -2.2261767387390137, + "logps/chosen": -177.13339233398438, + "logps/rejected": -1211.6153564453125, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0407990217208862, + "rewards/margins": 10.635047912597656, + "rewards/rejected": -11.675847053527832, + "step": 12890 + }, + { + "epoch": 0.77, + "learning_rate": 7.681891162260016e-07, + "logits/chosen": -2.674931764602661, + "logits/rejected": -2.2001960277557373, + "logps/chosen": -184.78048706054688, + "logps/rejected": -1247.32763671875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0805585384368896, + "rewards/margins": 10.953813552856445, + "rewards/rejected": -12.034372329711914, + "step": 12900 + }, + { + "epoch": 0.77, + "learning_rate": 7.644399315943016e-07, + "logits/chosen": -2.6839587688446045, + "logits/rejected": -2.255333185195923, + "logps/chosen": -155.64060974121094, + "logps/rejected": -1321.2723388671875, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.904283881187439, + "rewards/margins": 11.863615989685059, + "rewards/rejected": -12.767901420593262, + "step": 12910 + }, + { + "epoch": 0.77, + "learning_rate": 7.606982664475421e-07, + "logits/chosen": -2.714613199234009, + "logits/rejected": -2.2923293113708496, + "logps/chosen": -214.0587921142578, + "logps/rejected": -1296.0406494140625, + "loss": 0.0397, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.398858666419983, + "rewards/margins": 11.094388008117676, + "rewards/rejected": -12.493246078491211, + "step": 12920 + }, + { + "epoch": 0.77, + "learning_rate": 7.569641369968539e-07, + "logits/chosen": -2.6917829513549805, + "logits/rejected": -2.2468316555023193, + "logps/chosen": -170.21600341796875, + "logps/rejected": -1269.312744140625, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9905278086662292, + "rewards/margins": 11.267845153808594, + "rewards/rejected": -12.258373260498047, + "step": 12930 + }, + { + "epoch": 0.77, + "learning_rate": 7.532375594207236e-07, + "logits/chosen": -2.6901912689208984, + "logits/rejected": -2.180147409439087, + "logps/chosen": -152.74215698242188, + "logps/rejected": -1350.9908447265625, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7753187417984009, + "rewards/margins": 12.291214942932129, + "rewards/rejected": -13.066534042358398, + "step": 12940 + }, + { + "epoch": 0.77, + "learning_rate": 7.495185498649132e-07, + "logits/chosen": -2.669287919998169, + "logits/rejected": -2.2561023235321045, + "logps/chosen": -170.34580993652344, + "logps/rejected": -1266.3555908203125, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0135958194732666, + "rewards/margins": 11.203359603881836, + "rewards/rejected": -12.216954231262207, + "step": 12950 + }, + { + "epoch": 0.77, + "learning_rate": 7.45807124442399e-07, + "logits/chosen": -2.6803717613220215, + "logits/rejected": -2.2732787132263184, + "logps/chosen": -174.05111694335938, + "logps/rejected": -1233.6781005859375, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.003154993057251, + "rewards/margins": 10.904004096984863, + "rewards/rejected": -11.907158851623535, + "step": 12960 + }, + { + "epoch": 0.77, + "learning_rate": 7.421032992332967e-07, + "logits/chosen": -2.6921281814575195, + "logits/rejected": -2.278663158416748, + "logps/chosen": -174.88394165039062, + "logps/rejected": -1194.1551513671875, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0726388692855835, + "rewards/margins": 10.44238567352295, + "rewards/rejected": -11.51502513885498, + "step": 12970 + }, + { + "epoch": 0.77, + "learning_rate": 7.384070902847943e-07, + "logits/chosen": -2.683811902999878, + "logits/rejected": -2.2557904720306396, + "logps/chosen": -167.99618530273438, + "logps/rejected": -1221.670654296875, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9177943468093872, + "rewards/margins": 10.854341506958008, + "rewards/rejected": -11.772135734558105, + "step": 12980 + }, + { + "epoch": 0.77, + "learning_rate": 7.347185136110808e-07, + "logits/chosen": -2.6894168853759766, + "logits/rejected": -2.2807250022888184, + "logps/chosen": -185.720947265625, + "logps/rejected": -1281.350341796875, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1329227685928345, + "rewards/margins": 11.245623588562012, + "rewards/rejected": -12.378546714782715, + "step": 12990 + }, + { + "epoch": 0.78, + "learning_rate": 7.31037585193278e-07, + "logits/chosen": -2.623938798904419, + "logits/rejected": -2.2227213382720947, + "logps/chosen": -168.48818969726562, + "logps/rejected": -1299.114501953125, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9466837048530579, + "rewards/margins": 11.610816955566406, + "rewards/rejected": -12.557500839233398, + "step": 13000 + }, + { + "epoch": 0.78, + "learning_rate": 7.273643209793719e-07, + "logits/chosen": -2.6866536140441895, + "logits/rejected": -2.2167487144470215, + "logps/chosen": -159.50753784179688, + "logps/rejected": -1251.9810791015625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.847709059715271, + "rewards/margins": 11.217992782592773, + "rewards/rejected": -12.065701484680176, + "step": 13010 + }, + { + "epoch": 0.78, + "learning_rate": 7.236987368841386e-07, + "logits/chosen": -2.637845039367676, + "logits/rejected": -2.1932952404022217, + "logps/chosen": -179.9394073486328, + "logps/rejected": -1110.22265625, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1683967113494873, + "rewards/margins": 9.51075553894043, + "rewards/rejected": -10.679153442382812, + "step": 13020 + }, + { + "epoch": 0.78, + "learning_rate": 7.200408487890859e-07, + "logits/chosen": -2.7048897743225098, + "logits/rejected": -2.2351491451263428, + "logps/chosen": -165.5768280029297, + "logps/rejected": -1252.8206787109375, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9825845956802368, + "rewards/margins": 11.100459098815918, + "rewards/rejected": -12.083043098449707, + "step": 13030 + }, + { + "epoch": 0.78, + "learning_rate": 7.163906725423717e-07, + "logits/chosen": -2.678225040435791, + "logits/rejected": -2.2554025650024414, + "logps/chosen": -163.45361328125, + "logps/rejected": -1187.052978515625, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9612115621566772, + "rewards/margins": 10.474334716796875, + "rewards/rejected": -11.435545921325684, + "step": 13040 + }, + { + "epoch": 0.78, + "learning_rate": 7.127482239587449e-07, + "logits/chosen": -2.6812033653259277, + "logits/rejected": -2.2906978130340576, + "logps/chosen": -183.73699951171875, + "logps/rejected": -1277.0086669921875, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.16994309425354, + "rewards/margins": 11.151533126831055, + "rewards/rejected": -12.3214750289917, + "step": 13050 + }, + { + "epoch": 0.78, + "learning_rate": 7.091135188194729e-07, + "logits/chosen": -2.6475512981414795, + "logits/rejected": -2.125187397003174, + "logps/chosen": -162.56509399414062, + "logps/rejected": -1275.4913330078125, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9636369943618774, + "rewards/margins": 11.352842330932617, + "rewards/rejected": -12.316479682922363, + "step": 13060 + }, + { + "epoch": 0.78, + "learning_rate": 7.054865728722732e-07, + "logits/chosen": -2.695511817932129, + "logits/rejected": -2.306565761566162, + "logps/chosen": -179.28182983398438, + "logps/rejected": -1256.1376953125, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1137334108352661, + "rewards/margins": 11.013031959533691, + "rewards/rejected": -12.126764297485352, + "step": 13070 + }, + { + "epoch": 0.78, + "learning_rate": 7.018674018312468e-07, + "logits/chosen": -2.700685977935791, + "logits/rejected": -2.23781156539917, + "logps/chosen": -173.95602416992188, + "logps/rejected": -1210.769775390625, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9836994409561157, + "rewards/margins": 10.681673049926758, + "rewards/rejected": -11.66537094116211, + "step": 13080 + }, + { + "epoch": 0.78, + "learning_rate": 6.982560213768088e-07, + "logits/chosen": -2.6668457984924316, + "logits/rejected": -2.3130152225494385, + "logps/chosen": -196.2173614501953, + "logps/rejected": -1350.684814453125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.272918462753296, + "rewards/margins": 11.78793716430664, + "rewards/rejected": -13.0608549118042, + "step": 13090 + }, + { + "epoch": 0.78, + "learning_rate": 6.946524471556212e-07, + "logits/chosen": -2.642522096633911, + "logits/rejected": -2.2591443061828613, + "logps/chosen": -210.7269287109375, + "logps/rejected": -1289.4095458984375, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4042508602142334, + "rewards/margins": 11.056662559509277, + "rewards/rejected": -12.46091365814209, + "step": 13100 + }, + { + "epoch": 0.78, + "learning_rate": 6.91056694780522e-07, + "logits/chosen": -2.6940157413482666, + "logits/rejected": -2.279496192932129, + "logps/chosen": -201.9541778564453, + "logps/rejected": -1185.190185546875, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2740415334701538, + "rewards/margins": 10.136459350585938, + "rewards/rejected": -11.410501480102539, + "step": 13110 + }, + { + "epoch": 0.78, + "learning_rate": 6.874687798304657e-07, + "logits/chosen": -2.677957773208618, + "logits/rejected": -2.2969970703125, + "logps/chosen": -170.20960998535156, + "logps/rejected": -1304.2730712890625, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0024642944335938, + "rewards/margins": 11.60912799835205, + "rewards/rejected": -12.611592292785645, + "step": 13120 + }, + { + "epoch": 0.78, + "learning_rate": 6.83888717850445e-07, + "logits/chosen": -2.708543539047241, + "logits/rejected": -2.2202653884887695, + "logps/chosen": -172.69029235839844, + "logps/rejected": -1262.724365234375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9483838081359863, + "rewards/margins": 11.248501777648926, + "rewards/rejected": -12.196885108947754, + "step": 13130 + }, + { + "epoch": 0.78, + "learning_rate": 6.803165243514315e-07, + "logits/chosen": -2.67868971824646, + "logits/rejected": -2.2132647037506104, + "logps/chosen": -179.4116973876953, + "logps/rejected": -1266.4095458984375, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0981104373931885, + "rewards/margins": 11.128485679626465, + "rewards/rejected": -12.22659683227539, + "step": 13140 + }, + { + "epoch": 0.78, + "learning_rate": 6.767522148103054e-07, + "logits/chosen": -2.6488633155822754, + "logits/rejected": -2.2347731590270996, + "logps/chosen": -188.84371948242188, + "logps/rejected": -1151.799560546875, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1771914958953857, + "rewards/margins": 9.915623664855957, + "rewards/rejected": -11.092815399169922, + "step": 13150 + }, + { + "epoch": 0.78, + "learning_rate": 6.731958046697893e-07, + "logits/chosen": -2.726508140563965, + "logits/rejected": -2.3081257343292236, + "logps/chosen": -173.83941650390625, + "logps/rejected": -1308.51123046875, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9566122889518738, + "rewards/margins": 11.688125610351562, + "rewards/rejected": -12.64473819732666, + "step": 13160 + }, + { + "epoch": 0.79, + "learning_rate": 6.696473093383798e-07, + "logits/chosen": -2.6651060581207275, + "logits/rejected": -2.238739252090454, + "logps/chosen": -191.33001708984375, + "logps/rejected": -1191.375244140625, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1960890293121338, + "rewards/margins": 10.284440040588379, + "rewards/rejected": -11.480527877807617, + "step": 13170 + }, + { + "epoch": 0.79, + "learning_rate": 6.66106744190283e-07, + "logits/chosen": -2.6319127082824707, + "logits/rejected": -2.1692593097686768, + "logps/chosen": -163.61129760742188, + "logps/rejected": -1328.734130859375, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9775162935256958, + "rewards/margins": 11.859718322753906, + "rewards/rejected": -12.837234497070312, + "step": 13180 + }, + { + "epoch": 0.79, + "learning_rate": 6.625741245653466e-07, + "logits/chosen": -2.6469409465789795, + "logits/rejected": -2.295015811920166, + "logps/chosen": -170.31234741210938, + "logps/rejected": -1135.0689697265625, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.028408408164978, + "rewards/margins": 9.866082191467285, + "rewards/rejected": -10.894491195678711, + "step": 13190 + }, + { + "epoch": 0.79, + "learning_rate": 6.590494657689909e-07, + "logits/chosen": -2.6353957653045654, + "logits/rejected": -2.253103733062744, + "logps/chosen": -153.44149780273438, + "logps/rejected": -1347.284423828125, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8604599833488464, + "rewards/margins": 12.169416427612305, + "rewards/rejected": -13.029878616333008, + "step": 13200 + }, + { + "epoch": 0.79, + "learning_rate": 6.5553278307215e-07, + "logits/chosen": -2.728541851043701, + "logits/rejected": -2.20381498336792, + "logps/chosen": -166.08578491210938, + "logps/rejected": -1143.349853515625, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.952092170715332, + "rewards/margins": 10.038162231445312, + "rewards/rejected": -10.990252494812012, + "step": 13210 + }, + { + "epoch": 0.79, + "learning_rate": 6.520240917111961e-07, + "logits/chosen": -2.651829242706299, + "logits/rejected": -2.2207236289978027, + "logps/chosen": -177.0665283203125, + "logps/rejected": -1300.932861328125, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0621365308761597, + "rewards/margins": 11.499796867370605, + "rewards/rejected": -12.561933517456055, + "step": 13220 + }, + { + "epoch": 0.79, + "learning_rate": 6.485234068878809e-07, + "logits/chosen": -2.6543197631835938, + "logits/rejected": -2.2154428958892822, + "logps/chosen": -163.3125, + "logps/rejected": -1217.4219970703125, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9492557644844055, + "rewards/margins": 10.781808853149414, + "rewards/rejected": -11.731064796447754, + "step": 13230 + }, + { + "epoch": 0.79, + "learning_rate": 6.450307437692663e-07, + "logits/chosen": -2.680445432662964, + "logits/rejected": -2.251739025115967, + "logps/chosen": -174.94589233398438, + "logps/rejected": -1278.713134765625, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0443763732910156, + "rewards/margins": 11.288985252380371, + "rewards/rejected": -12.333361625671387, + "step": 13240 + }, + { + "epoch": 0.79, + "learning_rate": 6.415461174876589e-07, + "logits/chosen": -2.6188805103302, + "logits/rejected": -2.1884093284606934, + "logps/chosen": -207.90133666992188, + "logps/rejected": -1085.8309326171875, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3835684061050415, + "rewards/margins": 9.021151542663574, + "rewards/rejected": -10.4047212600708, + "step": 13250 + }, + { + "epoch": 0.79, + "learning_rate": 6.380695431405453e-07, + "logits/chosen": -2.7068963050842285, + "logits/rejected": -2.1892426013946533, + "logps/chosen": -182.9019317626953, + "logps/rejected": -1269.402587890625, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0909157991409302, + "rewards/margins": 11.161664962768555, + "rewards/rejected": -12.252579689025879, + "step": 13260 + }, + { + "epoch": 0.79, + "learning_rate": 6.346010357905269e-07, + "logits/chosen": -2.653012752532959, + "logits/rejected": -2.1631197929382324, + "logps/chosen": -165.99468994140625, + "logps/rejected": -1274.478515625, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9044996500015259, + "rewards/margins": 11.387828826904297, + "rewards/rejected": -12.292328834533691, + "step": 13270 + }, + { + "epoch": 0.79, + "learning_rate": 6.311406104652534e-07, + "logits/chosen": -2.6233255863189697, + "logits/rejected": -2.1972603797912598, + "logps/chosen": -160.0179443359375, + "logps/rejected": -1298.3896484375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9248750805854797, + "rewards/margins": 11.606918334960938, + "rewards/rejected": -12.531793594360352, + "step": 13280 + }, + { + "epoch": 0.79, + "learning_rate": 6.276882821573566e-07, + "logits/chosen": -2.678894519805908, + "logits/rejected": -2.205015182495117, + "logps/chosen": -174.76361083984375, + "logps/rejected": -1205.548095703125, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9612113833427429, + "rewards/margins": 10.646207809448242, + "rewards/rejected": -11.607417106628418, + "step": 13290 + }, + { + "epoch": 0.79, + "learning_rate": 6.242440658243915e-07, + "logits/chosen": -2.7096359729766846, + "logits/rejected": -2.2825987339019775, + "logps/chosen": -188.0618438720703, + "logps/rejected": -1294.4810791015625, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1275831460952759, + "rewards/margins": 11.366848945617676, + "rewards/rejected": -12.494430541992188, + "step": 13300 + }, + { + "epoch": 0.79, + "learning_rate": 6.208079763887626e-07, + "logits/chosen": -2.647400379180908, + "logits/rejected": -2.227109432220459, + "logps/chosen": -169.544677734375, + "logps/rejected": -1213.8255615234375, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9902595281600952, + "rewards/margins": 10.696873664855957, + "rewards/rejected": -11.6871337890625, + "step": 13310 + }, + { + "epoch": 0.79, + "learning_rate": 6.173800287376669e-07, + "logits/chosen": -2.669236183166504, + "logits/rejected": -2.142890453338623, + "logps/chosen": -174.83187866210938, + "logps/rejected": -1341.41552734375, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0087352991104126, + "rewards/margins": 11.962529182434082, + "rewards/rejected": -12.97126579284668, + "step": 13320 + }, + { + "epoch": 0.79, + "learning_rate": 6.139602377230247e-07, + "logits/chosen": -2.695554494857788, + "logits/rejected": -2.2485549449920654, + "logps/chosen": -172.8942108154297, + "logps/rejected": -1233.6075439453125, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0706312656402588, + "rewards/margins": 10.822290420532227, + "rewards/rejected": -11.892923355102539, + "step": 13330 + }, + { + "epoch": 0.8, + "learning_rate": 6.105486181614176e-07, + "logits/chosen": -2.6472957134246826, + "logits/rejected": -2.2596423625946045, + "logps/chosen": -183.82476806640625, + "logps/rejected": -1218.6607666015625, + "loss": 0.0232, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.1191326379776, + "rewards/margins": 10.635268211364746, + "rewards/rejected": -11.754400253295898, + "step": 13340 + }, + { + "epoch": 0.8, + "learning_rate": 6.071451848340235e-07, + "logits/chosen": -2.716681480407715, + "logits/rejected": -2.3031864166259766, + "logps/chosen": -183.01229858398438, + "logps/rejected": -1330.8768310546875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0952812433242798, + "rewards/margins": 11.753024101257324, + "rewards/rejected": -12.848306655883789, + "step": 13350 + }, + { + "epoch": 0.8, + "learning_rate": 6.037499524865523e-07, + "logits/chosen": -2.679293632507324, + "logits/rejected": -2.2975120544433594, + "logps/chosen": -196.26878356933594, + "logps/rejected": -1164.00634765625, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2833541631698608, + "rewards/margins": 9.899324417114258, + "rewards/rejected": -11.182679176330566, + "step": 13360 + }, + { + "epoch": 0.8, + "learning_rate": 6.003629358291832e-07, + "logits/chosen": -2.650348663330078, + "logits/rejected": -2.17437481880188, + "logps/chosen": -179.01318359375, + "logps/rejected": -1338.89208984375, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1230435371398926, + "rewards/margins": 11.820822715759277, + "rewards/rejected": -12.943865776062012, + "step": 13370 + }, + { + "epoch": 0.8, + "learning_rate": 5.969841495364978e-07, + "logits/chosen": -2.706183910369873, + "logits/rejected": -2.1938719749450684, + "logps/chosen": -171.48179626464844, + "logps/rejected": -1221.214111328125, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0038444995880127, + "rewards/margins": 10.76405143737793, + "rewards/rejected": -11.76789665222168, + "step": 13380 + }, + { + "epoch": 0.8, + "learning_rate": 5.936136082474228e-07, + "logits/chosen": -2.647810459136963, + "logits/rejected": -2.2589077949523926, + "logps/chosen": -192.23797607421875, + "logps/rejected": -1151.644287109375, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2304718494415283, + "rewards/margins": 9.834982872009277, + "rewards/rejected": -11.065455436706543, + "step": 13390 + }, + { + "epoch": 0.8, + "learning_rate": 5.902513265651585e-07, + "logits/chosen": -2.6624646186828613, + "logits/rejected": -2.2934396266937256, + "logps/chosen": -199.35960388183594, + "logps/rejected": -1224.0863037109375, + "loss": 0.0485, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.2951158285140991, + "rewards/margins": 10.49374771118164, + "rewards/rejected": -11.788863182067871, + "step": 13400 + }, + { + "epoch": 0.8, + "learning_rate": 5.868973190571214e-07, + "logits/chosen": -2.6969449520111084, + "logits/rejected": -2.2601099014282227, + "logps/chosen": -175.25234985351562, + "logps/rejected": -1302.220458984375, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0468733310699463, + "rewards/margins": 11.517797470092773, + "rewards/rejected": -12.564672470092773, + "step": 13410 + }, + { + "epoch": 0.8, + "learning_rate": 5.835516002548816e-07, + "logits/chosen": -2.676764488220215, + "logits/rejected": -2.2897772789001465, + "logps/chosen": -166.84677124023438, + "logps/rejected": -1289.373779296875, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9588912129402161, + "rewards/margins": 11.490133285522461, + "rewards/rejected": -12.44902515411377, + "step": 13420 + }, + { + "epoch": 0.8, + "learning_rate": 5.802141846540932e-07, + "logits/chosen": -2.636695384979248, + "logits/rejected": -2.1682748794555664, + "logps/chosen": -171.29469299316406, + "logps/rejected": -1207.435791015625, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0376616716384888, + "rewards/margins": 10.577069282531738, + "rewards/rejected": -11.614728927612305, + "step": 13430 + }, + { + "epoch": 0.8, + "learning_rate": 5.768850867144385e-07, + "logits/chosen": -2.6255812644958496, + "logits/rejected": -2.207350254058838, + "logps/chosen": -168.15487670898438, + "logps/rejected": -1215.546142578125, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9339563250541687, + "rewards/margins": 10.77913761138916, + "rewards/rejected": -11.713093757629395, + "step": 13440 + }, + { + "epoch": 0.8, + "learning_rate": 5.735643208595623e-07, + "logits/chosen": -2.6327404975891113, + "logits/rejected": -2.2287228107452393, + "logps/chosen": -177.02865600585938, + "logps/rejected": -1245.131591796875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0770198106765747, + "rewards/margins": 10.917552947998047, + "rewards/rejected": -11.994571685791016, + "step": 13450 + }, + { + "epoch": 0.8, + "learning_rate": 5.702519014770108e-07, + "logits/chosen": -2.640324115753174, + "logits/rejected": -2.2754924297332764, + "logps/chosen": -198.15188598632812, + "logps/rejected": -1141.55712890625, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2626521587371826, + "rewards/margins": 9.7100191116333, + "rewards/rejected": -10.972671508789062, + "step": 13460 + }, + { + "epoch": 0.8, + "learning_rate": 5.669478429181646e-07, + "logits/chosen": -2.681659698486328, + "logits/rejected": -2.305046558380127, + "logps/chosen": -168.178955078125, + "logps/rejected": -1214.56494140625, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9352678060531616, + "rewards/margins": 10.766515731811523, + "rewards/rejected": -11.701784133911133, + "step": 13470 + }, + { + "epoch": 0.8, + "learning_rate": 5.636521594981851e-07, + "logits/chosen": -2.719573497772217, + "logits/rejected": -2.2618885040283203, + "logps/chosen": -156.7507781982422, + "logps/rejected": -1208.9212646484375, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9281970858573914, + "rewards/margins": 10.716633796691895, + "rewards/rejected": -11.644829750061035, + "step": 13480 + }, + { + "epoch": 0.8, + "learning_rate": 5.603648654959454e-07, + "logits/chosen": -2.657705307006836, + "logits/rejected": -2.1829946041107178, + "logps/chosen": -190.27902221679688, + "logps/rejected": -1258.4146728515625, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.131217360496521, + "rewards/margins": 11.015460014343262, + "rewards/rejected": -12.146677017211914, + "step": 13490 + }, + { + "epoch": 0.81, + "learning_rate": 5.570859751539687e-07, + "logits/chosen": -2.707965612411499, + "logits/rejected": -2.179104804992676, + "logps/chosen": -166.4765625, + "logps/rejected": -1183.0833740234375, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8997961282730103, + "rewards/margins": 10.489396095275879, + "rewards/rejected": -11.389192581176758, + "step": 13500 + }, + { + "epoch": 0.81, + "learning_rate": 5.538155026783726e-07, + "logits/chosen": -2.6481680870056152, + "logits/rejected": -2.203890323638916, + "logps/chosen": -216.88485717773438, + "logps/rejected": -1153.209716796875, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4733598232269287, + "rewards/margins": 9.611236572265625, + "rewards/rejected": -11.084596633911133, + "step": 13510 + }, + { + "epoch": 0.81, + "learning_rate": 5.505534622387998e-07, + "logits/chosen": -2.6645545959472656, + "logits/rejected": -2.2244956493377686, + "logps/chosen": -176.46615600585938, + "logps/rejected": -1325.2359619140625, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9905093908309937, + "rewards/margins": 11.800531387329102, + "rewards/rejected": -12.791041374206543, + "step": 13520 + }, + { + "epoch": 0.81, + "learning_rate": 5.472998679683619e-07, + "logits/chosen": -2.704134702682495, + "logits/rejected": -2.2616806030273438, + "logps/chosen": -211.95303344726562, + "logps/rejected": -1289.783203125, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.371867299079895, + "rewards/margins": 11.083040237426758, + "rewards/rejected": -12.454907417297363, + "step": 13530 + }, + { + "epoch": 0.81, + "learning_rate": 5.440547339635769e-07, + "logits/chosen": -2.6759040355682373, + "logits/rejected": -2.2015576362609863, + "logps/chosen": -175.56509399414062, + "logps/rejected": -1367.412353515625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0149471759796143, + "rewards/margins": 12.2113037109375, + "rewards/rejected": -13.226249694824219, + "step": 13540 + }, + { + "epoch": 0.81, + "learning_rate": 5.408180742843069e-07, + "logits/chosen": -2.673353672027588, + "logits/rejected": -2.2181715965270996, + "logps/chosen": -175.46861267089844, + "logps/rejected": -1311.572509765625, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0530420541763306, + "rewards/margins": 11.617881774902344, + "rewards/rejected": -12.67092514038086, + "step": 13550 + }, + { + "epoch": 0.81, + "learning_rate": 5.375899029536996e-07, + "logits/chosen": -2.6008431911468506, + "logits/rejected": -2.196563243865967, + "logps/chosen": -158.3331756591797, + "logps/rejected": -1320.8887939453125, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8840807676315308, + "rewards/margins": 11.880335807800293, + "rewards/rejected": -12.764415740966797, + "step": 13560 + }, + { + "epoch": 0.81, + "learning_rate": 5.34370233958125e-07, + "logits/chosen": -2.7051897048950195, + "logits/rejected": -2.218392848968506, + "logps/chosen": -176.99301147460938, + "logps/rejected": -1282.6273193359375, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0912928581237793, + "rewards/margins": 11.286653518676758, + "rewards/rejected": -12.377946853637695, + "step": 13570 + }, + { + "epoch": 0.81, + "learning_rate": 5.311590812471165e-07, + "logits/chosen": -2.680466413497925, + "logits/rejected": -2.2152719497680664, + "logps/chosen": -189.317138671875, + "logps/rejected": -1248.241943359375, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2016308307647705, + "rewards/margins": 10.841754913330078, + "rewards/rejected": -12.043386459350586, + "step": 13580 + }, + { + "epoch": 0.81, + "learning_rate": 5.279564587333077e-07, + "logits/chosen": -2.6547799110412598, + "logits/rejected": -2.2106080055236816, + "logps/chosen": -176.56942749023438, + "logps/rejected": -1225.6854248046875, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1161929368972778, + "rewards/margins": 10.699230194091797, + "rewards/rejected": -11.815423965454102, + "step": 13590 + }, + { + "epoch": 0.81, + "learning_rate": 5.247623802923788e-07, + "logits/chosen": -2.699678421020508, + "logits/rejected": -2.291386842727661, + "logps/chosen": -172.39993286132812, + "logps/rejected": -1277.2335205078125, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.028847336769104, + "rewards/margins": 11.290389060974121, + "rewards/rejected": -12.319235801696777, + "step": 13600 + }, + { + "epoch": 0.81, + "learning_rate": 5.215768597629872e-07, + "logits/chosen": -2.7503225803375244, + "logits/rejected": -2.293412446975708, + "logps/chosen": -165.2312469482422, + "logps/rejected": -1258.467529296875, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9487820863723755, + "rewards/margins": 11.192811965942383, + "rewards/rejected": -12.141592025756836, + "step": 13610 + }, + { + "epoch": 0.81, + "learning_rate": 5.18399910946715e-07, + "logits/chosen": -2.6760306358337402, + "logits/rejected": -2.1967055797576904, + "logps/chosen": -157.71188354492188, + "logps/rejected": -1231.4622802734375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.835241973400116, + "rewards/margins": 11.033151626586914, + "rewards/rejected": -11.86839485168457, + "step": 13620 + }, + { + "epoch": 0.81, + "learning_rate": 5.152315476080058e-07, + "logits/chosen": -2.7032432556152344, + "logits/rejected": -2.1931357383728027, + "logps/chosen": -162.85061645507812, + "logps/rejected": -1246.513916015625, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9287565350532532, + "rewards/margins": 11.081074714660645, + "rewards/rejected": -12.009831428527832, + "step": 13630 + }, + { + "epoch": 0.81, + "learning_rate": 5.12071783474106e-07, + "logits/chosen": -2.622471809387207, + "logits/rejected": -2.254931688308716, + "logps/chosen": -170.14712524414062, + "logps/rejected": -1179.711669921875, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0697211027145386, + "rewards/margins": 10.290043830871582, + "rewards/rejected": -11.359764099121094, + "step": 13640 + }, + { + "epoch": 0.81, + "learning_rate": 5.089206322350046e-07, + "logits/chosen": -2.5998706817626953, + "logits/rejected": -2.248945713043213, + "logps/chosen": -195.8653106689453, + "logps/rejected": -1266.3809814453125, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.266817331314087, + "rewards/margins": 10.947492599487305, + "rewards/rejected": -12.214308738708496, + "step": 13650 + }, + { + "epoch": 0.81, + "learning_rate": 5.057781075433751e-07, + "logits/chosen": -2.671738386154175, + "logits/rejected": -2.2132976055145264, + "logps/chosen": -172.5011444091797, + "logps/rejected": -1306.454833984375, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0415641069412231, + "rewards/margins": 11.572668075561523, + "rewards/rejected": -12.614233016967773, + "step": 13660 + }, + { + "epoch": 0.82, + "learning_rate": 5.026442230145157e-07, + "logits/chosen": -2.7238144874572754, + "logits/rejected": -2.311789035797119, + "logps/chosen": -169.64390563964844, + "logps/rejected": -1288.0789794921875, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9923800230026245, + "rewards/margins": 11.448450088500977, + "rewards/rejected": -12.440828323364258, + "step": 13670 + }, + { + "epoch": 0.82, + "learning_rate": 4.995189922262877e-07, + "logits/chosen": -2.6588640213012695, + "logits/rejected": -2.228370189666748, + "logps/chosen": -159.25311279296875, + "logps/rejected": -1209.178466796875, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8602941632270813, + "rewards/margins": 10.785550117492676, + "rewards/rejected": -11.645845413208008, + "step": 13680 + }, + { + "epoch": 0.82, + "learning_rate": 4.964024287190644e-07, + "logits/chosen": -2.7045180797576904, + "logits/rejected": -2.1723971366882324, + "logps/chosen": -166.21282958984375, + "logps/rejected": -1224.7825927734375, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0211451053619385, + "rewards/margins": 10.764714241027832, + "rewards/rejected": -11.785860061645508, + "step": 13690 + }, + { + "epoch": 0.82, + "learning_rate": 4.932945459956617e-07, + "logits/chosen": -2.692246675491333, + "logits/rejected": -2.257985830307007, + "logps/chosen": -166.0492401123047, + "logps/rejected": -1306.399169921875, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9635535478591919, + "rewards/margins": 11.66749382019043, + "rewards/rejected": -12.631048202514648, + "step": 13700 + }, + { + "epoch": 0.82, + "learning_rate": 4.901953575212884e-07, + "logits/chosen": -2.6288509368896484, + "logits/rejected": -2.167935609817505, + "logps/chosen": -177.33889770507812, + "logps/rejected": -1239.9371337890625, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1235032081604004, + "rewards/margins": 10.83365249633789, + "rewards/rejected": -11.957155227661133, + "step": 13710 + }, + { + "epoch": 0.82, + "learning_rate": 4.87104876723484e-07, + "logits/chosen": -2.680795192718506, + "logits/rejected": -2.2613396644592285, + "logps/chosen": -181.4055633544922, + "logps/rejected": -1192.895263671875, + "loss": 0.0353, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.1026599407196045, + "rewards/margins": 10.384865760803223, + "rewards/rejected": -11.487527847290039, + "step": 13720 + }, + { + "epoch": 0.82, + "learning_rate": 4.840231169920609e-07, + "logits/chosen": -2.6946616172790527, + "logits/rejected": -2.1802120208740234, + "logps/chosen": -183.39422607421875, + "logps/rejected": -1199.854248046875, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0639870166778564, + "rewards/margins": 10.495994567871094, + "rewards/rejected": -11.559983253479004, + "step": 13730 + }, + { + "epoch": 0.82, + "learning_rate": 4.809500916790466e-07, + "logits/chosen": -2.661583185195923, + "logits/rejected": -2.217468500137329, + "logps/chosen": -163.66004943847656, + "logps/rejected": -1271.381591796875, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8894637227058411, + "rewards/margins": 11.384147644042969, + "rewards/rejected": -12.273612022399902, + "step": 13740 + }, + { + "epoch": 0.82, + "learning_rate": 4.778858140986259e-07, + "logits/chosen": -2.6748485565185547, + "logits/rejected": -2.1921467781066895, + "logps/chosen": -166.37962341308594, + "logps/rejected": -1257.1416015625, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9459733963012695, + "rewards/margins": 11.175190925598145, + "rewards/rejected": -12.12116527557373, + "step": 13750 + }, + { + "epoch": 0.82, + "learning_rate": 4.748302975270838e-07, + "logits/chosen": -2.6516661643981934, + "logits/rejected": -2.244601011276245, + "logps/chosen": -177.70980834960938, + "logps/rejected": -1294.5078125, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0942624807357788, + "rewards/margins": 11.416128158569336, + "rewards/rejected": -12.510391235351562, + "step": 13760 + }, + { + "epoch": 0.82, + "learning_rate": 4.71783555202745e-07, + "logits/chosen": -2.65417742729187, + "logits/rejected": -2.1740970611572266, + "logps/chosen": -171.97743225097656, + "logps/rejected": -1268.628662109375, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0818229913711548, + "rewards/margins": 11.163043022155762, + "rewards/rejected": -12.244867324829102, + "step": 13770 + }, + { + "epoch": 0.82, + "learning_rate": 4.6874560032592333e-07, + "logits/chosen": -2.6474406719207764, + "logits/rejected": -2.3152217864990234, + "logps/chosen": -168.77127075195312, + "logps/rejected": -1212.509033203125, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0077855587005615, + "rewards/margins": 10.671684265136719, + "rewards/rejected": -11.67947006225586, + "step": 13780 + }, + { + "epoch": 0.82, + "learning_rate": 4.6571644605885565e-07, + "logits/chosen": -2.683969736099243, + "logits/rejected": -2.2438387870788574, + "logps/chosen": -176.6464080810547, + "logps/rejected": -1315.1319580078125, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0864895582199097, + "rewards/margins": 11.621270179748535, + "rewards/rejected": -12.707758903503418, + "step": 13790 + }, + { + "epoch": 0.82, + "learning_rate": 4.6269610552565153e-07, + "logits/chosen": -2.636932849884033, + "logits/rejected": -2.1782472133636475, + "logps/chosen": -172.1649627685547, + "logps/rejected": -1294.2266845703125, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9745909571647644, + "rewards/margins": 11.520150184631348, + "rewards/rejected": -12.494741439819336, + "step": 13800 + }, + { + "epoch": 0.82, + "learning_rate": 4.5968459181223416e-07, + "logits/chosen": -2.6854310035705566, + "logits/rejected": -2.282257556915283, + "logps/chosen": -176.67733764648438, + "logps/rejected": -1218.510498046875, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0794392824172974, + "rewards/margins": 10.672860145568848, + "rewards/rejected": -11.752298355102539, + "step": 13810 + }, + { + "epoch": 0.82, + "learning_rate": 4.566819179662829e-07, + "logits/chosen": -2.6704695224761963, + "logits/rejected": -2.292807102203369, + "logps/chosen": -178.35671997070312, + "logps/rejected": -1292.9691162109375, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0838291645050049, + "rewards/margins": 11.395403861999512, + "rewards/rejected": -12.479230880737305, + "step": 13820 + }, + { + "epoch": 0.82, + "learning_rate": 4.5368809699717855e-07, + "logits/chosen": -2.6712725162506104, + "logits/rejected": -2.286290407180786, + "logps/chosen": -178.18814086914062, + "logps/rejected": -1236.680908203125, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0077872276306152, + "rewards/margins": 10.912640571594238, + "rewards/rejected": -11.920427322387695, + "step": 13830 + }, + { + "epoch": 0.83, + "learning_rate": 4.507031418759447e-07, + "logits/chosen": -2.704024314880371, + "logits/rejected": -2.2878246307373047, + "logps/chosen": -180.00888061523438, + "logps/rejected": -1279.6773681640625, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.077067494392395, + "rewards/margins": 11.275760650634766, + "rewards/rejected": -12.352828979492188, + "step": 13840 + }, + { + "epoch": 0.83, + "learning_rate": 4.477270655351942e-07, + "logits/chosen": -2.6880481243133545, + "logits/rejected": -2.167849063873291, + "logps/chosen": -180.31747436523438, + "logps/rejected": -1329.6435546875, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1320539712905884, + "rewards/margins": 11.714104652404785, + "rewards/rejected": -12.846158027648926, + "step": 13850 + }, + { + "epoch": 0.83, + "learning_rate": 4.447598808690695e-07, + "logits/chosen": -2.6901631355285645, + "logits/rejected": -2.1718173027038574, + "logps/chosen": -163.86790466308594, + "logps/rejected": -1238.12158203125, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9505518674850464, + "rewards/margins": 10.998294830322266, + "rewards/rejected": -11.948846817016602, + "step": 13860 + }, + { + "epoch": 0.83, + "learning_rate": 4.418016007331924e-07, + "logits/chosen": -2.6914234161376953, + "logits/rejected": -2.282498836517334, + "logps/chosen": -165.16580200195312, + "logps/rejected": -1366.4530029296875, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9785376787185669, + "rewards/margins": 12.24502182006836, + "rewards/rejected": -13.223559379577637, + "step": 13870 + }, + { + "epoch": 0.83, + "learning_rate": 4.3885223794460114e-07, + "logits/chosen": -2.7076926231384277, + "logits/rejected": -2.2664225101470947, + "logps/chosen": -172.03372192382812, + "logps/rejected": -1235.4722900390625, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9551488757133484, + "rewards/margins": 10.9685640335083, + "rewards/rejected": -11.923711776733398, + "step": 13880 + }, + { + "epoch": 0.83, + "learning_rate": 4.359118052817013e-07, + "logits/chosen": -2.6591410636901855, + "logits/rejected": -2.0986130237579346, + "logps/chosen": -194.7082061767578, + "logps/rejected": -1338.324462890625, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2044440507888794, + "rewards/margins": 11.721646308898926, + "rewards/rejected": -12.926091194152832, + "step": 13890 + }, + { + "epoch": 0.83, + "learning_rate": 4.3298031548420716e-07, + "logits/chosen": -2.676812171936035, + "logits/rejected": -2.207137107849121, + "logps/chosen": -170.67684936523438, + "logps/rejected": -1282.3267822265625, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.07079017162323, + "rewards/margins": 11.308879852294922, + "rewards/rejected": -12.379671096801758, + "step": 13900 + }, + { + "epoch": 0.83, + "learning_rate": 4.300577812530868e-07, + "logits/chosen": -2.654151678085327, + "logits/rejected": -2.201157331466675, + "logps/chosen": -180.61279296875, + "logps/rejected": -1209.297119140625, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1155108213424683, + "rewards/margins": 10.541376113891602, + "rewards/rejected": -11.656888961791992, + "step": 13910 + }, + { + "epoch": 0.83, + "learning_rate": 4.2714421525050734e-07, + "logits/chosen": -2.6808247566223145, + "logits/rejected": -2.247138500213623, + "logps/chosen": -184.03382873535156, + "logps/rejected": -1217.38134765625, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0670932531356812, + "rewards/margins": 10.660669326782227, + "rewards/rejected": -11.727762222290039, + "step": 13920 + }, + { + "epoch": 0.83, + "learning_rate": 4.242396300997809e-07, + "logits/chosen": -2.693331480026245, + "logits/rejected": -2.2159297466278076, + "logps/chosen": -167.58206176757812, + "logps/rejected": -1344.5343017578125, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9405523538589478, + "rewards/margins": 12.056676864624023, + "rewards/rejected": -12.997228622436523, + "step": 13930 + }, + { + "epoch": 0.83, + "learning_rate": 4.213440383853093e-07, + "logits/chosen": -2.6392228603363037, + "logits/rejected": -2.2534401416778564, + "logps/chosen": -156.3339385986328, + "logps/rejected": -1310.461669921875, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8530643582344055, + "rewards/margins": 11.808847427368164, + "rewards/rejected": -12.661911010742188, + "step": 13940 + }, + { + "epoch": 0.83, + "learning_rate": 4.1845745265252673e-07, + "logits/chosen": -2.6900010108947754, + "logits/rejected": -2.2535605430603027, + "logps/chosen": -167.2605743408203, + "logps/rejected": -1268.8167724609375, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0186363458633423, + "rewards/margins": 11.22164535522461, + "rewards/rejected": -12.24028205871582, + "step": 13950 + }, + { + "epoch": 0.83, + "learning_rate": 4.15579885407853e-07, + "logits/chosen": -2.6795239448547363, + "logits/rejected": -2.261195659637451, + "logps/chosen": -183.81129455566406, + "logps/rejected": -1348.705322265625, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1615241765975952, + "rewards/margins": 11.869373321533203, + "rewards/rejected": -13.03089714050293, + "step": 13960 + }, + { + "epoch": 0.83, + "learning_rate": 4.1271134911862936e-07, + "logits/chosen": -2.6693310737609863, + "logits/rejected": -2.237502336502075, + "logps/chosen": -198.24862670898438, + "logps/rejected": -1274.082275390625, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2623385190963745, + "rewards/margins": 11.021790504455566, + "rewards/rejected": -12.284128189086914, + "step": 13970 + }, + { + "epoch": 0.83, + "learning_rate": 4.0985185621307293e-07, + "logits/chosen": -2.6910958290100098, + "logits/rejected": -2.262298107147217, + "logps/chosen": -183.97500610351562, + "logps/rejected": -1292.0716552734375, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1392381191253662, + "rewards/margins": 11.339223861694336, + "rewards/rejected": -12.478460311889648, + "step": 13980 + }, + { + "epoch": 0.83, + "learning_rate": 4.0700141908021793e-07, + "logits/chosen": -2.6785387992858887, + "logits/rejected": -2.215505838394165, + "logps/chosen": -199.35546875, + "logps/rejected": -1358.932373046875, + "loss": 0.067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.240281343460083, + "rewards/margins": 11.902799606323242, + "rewards/rejected": -13.143081665039062, + "step": 13990 + }, + { + "epoch": 0.83, + "learning_rate": 4.041600500698642e-07, + "logits/chosen": -2.6829400062561035, + "logits/rejected": -2.223679304122925, + "logps/chosen": -188.1009979248047, + "logps/rejected": -1227.83740234375, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2205641269683838, + "rewards/margins": 10.616205215454102, + "rewards/rejected": -11.836769104003906, + "step": 14000 + }, + { + "epoch": 0.83, + "eval_logits/chosen": -2.6053109169006348, + "eval_logits/rejected": -2.383760452270508, + "eval_logps/chosen": -264.53240966796875, + "eval_logps/rejected": -1094.8602294921875, + "eval_loss": 0.002599479164928198, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": -1.974579930305481, + "eval_rewards/margins": 8.518539428710938, + "eval_rewards/rejected": -10.493120193481445, + "eval_runtime": 3.9017, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.256, + "step": 14000 + }, + { + "epoch": 0.84, + "learning_rate": 4.013277614925229e-07, + "logits/chosen": -2.658773899078369, + "logits/rejected": -2.1468019485473633, + "logps/chosen": -179.64801025390625, + "logps/rejected": -1278.55908203125, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0787864923477173, + "rewards/margins": 11.25596809387207, + "rewards/rejected": -12.33475399017334, + "step": 14010 + }, + { + "epoch": 0.84, + "learning_rate": 3.985045656193631e-07, + "logits/chosen": -2.668735980987549, + "logits/rejected": -2.2114713191986084, + "logps/chosen": -189.68881225585938, + "logps/rejected": -1241.6612548828125, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1798040866851807, + "rewards/margins": 10.79493522644043, + "rewards/rejected": -11.974739074707031, + "step": 14020 + }, + { + "epoch": 0.84, + "learning_rate": 3.9569047468215967e-07, + "logits/chosen": -2.6559648513793945, + "logits/rejected": -2.111429214477539, + "logps/chosen": -171.84005737304688, + "logps/rejected": -1269.4163818359375, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9797341227531433, + "rewards/margins": 11.274862289428711, + "rewards/rejected": -12.254595756530762, + "step": 14030 + }, + { + "epoch": 0.84, + "learning_rate": 3.9288550087323687e-07, + "logits/chosen": -2.631896495819092, + "logits/rejected": -2.1895270347595215, + "logps/chosen": -183.16319274902344, + "logps/rejected": -1329.515869140625, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1326725482940674, + "rewards/margins": 11.718535423278809, + "rewards/rejected": -12.85120964050293, + "step": 14040 + }, + { + "epoch": 0.84, + "learning_rate": 3.900896563454226e-07, + "logits/chosen": -2.6776461601257324, + "logits/rejected": -2.24357271194458, + "logps/chosen": -166.2142333984375, + "logps/rejected": -1312.224365234375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9974905252456665, + "rewards/margins": 11.684764862060547, + "rewards/rejected": -12.682255744934082, + "step": 14050 + }, + { + "epoch": 0.84, + "learning_rate": 3.873029532119868e-07, + "logits/chosen": -2.646001100540161, + "logits/rejected": -2.2312138080596924, + "logps/chosen": -154.52357482910156, + "logps/rejected": -1280.1116943359375, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8567987680435181, + "rewards/margins": 11.488916397094727, + "rewards/rejected": -12.345715522766113, + "step": 14060 + }, + { + "epoch": 0.84, + "learning_rate": 3.845254035465951e-07, + "logits/chosen": -2.6889290809631348, + "logits/rejected": -2.277940273284912, + "logps/chosen": -176.64369201660156, + "logps/rejected": -1371.5977783203125, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0301624536514282, + "rewards/margins": 12.242680549621582, + "rewards/rejected": -13.272842407226562, + "step": 14070 + }, + { + "epoch": 0.84, + "learning_rate": 3.8175701938325677e-07, + "logits/chosen": -2.6686508655548096, + "logits/rejected": -2.156785488128662, + "logps/chosen": -173.02218627929688, + "logps/rejected": -1296.3160400390625, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0174912214279175, + "rewards/margins": 11.499128341674805, + "rewards/rejected": -12.516620635986328, + "step": 14080 + }, + { + "epoch": 0.84, + "learning_rate": 3.7899781271626747e-07, + "logits/chosen": -2.692993402481079, + "logits/rejected": -2.2525722980499268, + "logps/chosen": -161.3787841796875, + "logps/rejected": -1306.217529296875, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8953760266304016, + "rewards/margins": 11.722616195678711, + "rewards/rejected": -12.617993354797363, + "step": 14090 + }, + { + "epoch": 0.84, + "learning_rate": 3.76247795500162e-07, + "logits/chosen": -2.6358532905578613, + "logits/rejected": -2.2306854724884033, + "logps/chosen": -182.1425323486328, + "logps/rejected": -1269.7691650390625, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1269450187683105, + "rewards/margins": 11.12839412689209, + "rewards/rejected": -12.255338668823242, + "step": 14100 + }, + { + "epoch": 0.84, + "learning_rate": 3.73506979649661e-07, + "logits/chosen": -2.64140248298645, + "logits/rejected": -2.179159641265869, + "logps/chosen": -164.57003784179688, + "logps/rejected": -1269.4193115234375, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9803975820541382, + "rewards/margins": 11.26661205291748, + "rewards/rejected": -12.247010231018066, + "step": 14110 + }, + { + "epoch": 0.84, + "learning_rate": 3.707753770396197e-07, + "logits/chosen": -2.663074254989624, + "logits/rejected": -2.2346599102020264, + "logps/chosen": -192.89430236816406, + "logps/rejected": -1316.9742431640625, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.185072422027588, + "rewards/margins": 11.541204452514648, + "rewards/rejected": -12.726276397705078, + "step": 14120 + }, + { + "epoch": 0.84, + "learning_rate": 3.6805299950497366e-07, + "logits/chosen": -2.6560990810394287, + "logits/rejected": -2.23017954826355, + "logps/chosen": -189.10353088378906, + "logps/rejected": -1307.087890625, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.142559289932251, + "rewards/margins": 11.477446556091309, + "rewards/rejected": -12.620006561279297, + "step": 14130 + }, + { + "epoch": 0.84, + "learning_rate": 3.653398588406937e-07, + "logits/chosen": -2.682931661605835, + "logits/rejected": -2.249525547027588, + "logps/chosen": -176.5172119140625, + "logps/rejected": -1271.5799560546875, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.034685730934143, + "rewards/margins": 11.222209930419922, + "rewards/rejected": -12.256896018981934, + "step": 14140 + }, + { + "epoch": 0.84, + "learning_rate": 3.626359668017285e-07, + "logits/chosen": -2.6658554077148438, + "logits/rejected": -2.208019733428955, + "logps/chosen": -179.17420959472656, + "logps/rejected": -1218.5045166015625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0749691724777222, + "rewards/margins": 10.65962028503418, + "rewards/rejected": -11.734590530395508, + "step": 14150 + }, + { + "epoch": 0.84, + "learning_rate": 3.5994133510295517e-07, + "logits/chosen": -2.687696695327759, + "logits/rejected": -2.280946731567383, + "logps/chosen": -171.23837280273438, + "logps/rejected": -1219.3798828125, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9967787861824036, + "rewards/margins": 10.754220962524414, + "rewards/rejected": -11.751001358032227, + "step": 14160 + }, + { + "epoch": 0.84, + "learning_rate": 3.572559754191332e-07, + "logits/chosen": -2.6567318439483643, + "logits/rejected": -2.2087929248809814, + "logps/chosen": -171.87026977539062, + "logps/rejected": -1285.305908203125, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9473191499710083, + "rewards/margins": 11.470184326171875, + "rewards/rejected": -12.417502403259277, + "step": 14170 + }, + { + "epoch": 0.85, + "learning_rate": 3.545798993848465e-07, + "logits/chosen": -2.623717784881592, + "logits/rejected": -2.213705539703369, + "logps/chosen": -174.06163024902344, + "logps/rejected": -1281.0960693359375, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.045467495918274, + "rewards/margins": 11.316000938415527, + "rewards/rejected": -12.361467361450195, + "step": 14180 + }, + { + "epoch": 0.85, + "learning_rate": 3.51913118594458e-07, + "logits/chosen": -2.6105799674987793, + "logits/rejected": -2.276928424835205, + "logps/chosen": -172.11465454101562, + "logps/rejected": -1310.3248291015625, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0431947708129883, + "rewards/margins": 11.61866283416748, + "rewards/rejected": -12.661857604980469, + "step": 14190 + }, + { + "epoch": 0.85, + "learning_rate": 3.492556446020587e-07, + "logits/chosen": -2.6462557315826416, + "logits/rejected": -2.229262590408325, + "logps/chosen": -180.5137176513672, + "logps/rejected": -1249.146240234375, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0995638370513916, + "rewards/margins": 10.961332321166992, + "rewards/rejected": -12.060895919799805, + "step": 14200 + }, + { + "epoch": 0.85, + "learning_rate": 3.466074889214169e-07, + "logits/chosen": -2.633293390274048, + "logits/rejected": -2.2219607830047607, + "logps/chosen": -198.06295776367188, + "logps/rejected": -1143.631591796875, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.283493161201477, + "rewards/margins": 9.719396591186523, + "rewards/rejected": -11.002891540527344, + "step": 14210 + }, + { + "epoch": 0.85, + "learning_rate": 3.4396866302592593e-07, + "logits/chosen": -2.7021024227142334, + "logits/rejected": -2.3271546363830566, + "logps/chosen": -168.0321807861328, + "logps/rejected": -1253.287353515625, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9981589317321777, + "rewards/margins": 11.080887794494629, + "rewards/rejected": -12.079048156738281, + "step": 14220 + }, + { + "epoch": 0.85, + "learning_rate": 3.413391783485606e-07, + "logits/chosen": -2.659151315689087, + "logits/rejected": -2.2468862533569336, + "logps/chosen": -188.7427978515625, + "logps/rejected": -1302.943359375, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.18603515625, + "rewards/margins": 11.406412124633789, + "rewards/rejected": -12.592448234558105, + "step": 14230 + }, + { + "epoch": 0.85, + "learning_rate": 3.3871904628182267e-07, + "logits/chosen": -2.6383728981018066, + "logits/rejected": -2.2224040031433105, + "logps/chosen": -189.111083984375, + "logps/rejected": -1412.593505859375, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1670124530792236, + "rewards/margins": 12.524897575378418, + "rewards/rejected": -13.691909790039062, + "step": 14240 + }, + { + "epoch": 0.85, + "learning_rate": 3.361082781776906e-07, + "logits/chosen": -2.6593356132507324, + "logits/rejected": -2.2361083030700684, + "logps/chosen": -156.43466186523438, + "logps/rejected": -1166.180908203125, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9144033193588257, + "rewards/margins": 10.305667877197266, + "rewards/rejected": -11.220071792602539, + "step": 14250 + }, + { + "epoch": 0.85, + "learning_rate": 3.335068853475762e-07, + "logits/chosen": -2.6842901706695557, + "logits/rejected": -2.210298538208008, + "logps/chosen": -189.82516479492188, + "logps/rejected": -1330.842041015625, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.173034429550171, + "rewards/margins": 11.690396308898926, + "rewards/rejected": -12.863430976867676, + "step": 14260 + }, + { + "epoch": 0.85, + "learning_rate": 3.309148790622688e-07, + "logits/chosen": -2.697014331817627, + "logits/rejected": -2.1413753032684326, + "logps/chosen": -171.2134246826172, + "logps/rejected": -1357.882080078125, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0021451711654663, + "rewards/margins": 12.136377334594727, + "rewards/rejected": -13.138522148132324, + "step": 14270 + }, + { + "epoch": 0.85, + "learning_rate": 3.2833227055189126e-07, + "logits/chosen": -2.6973605155944824, + "logits/rejected": -2.2502338886260986, + "logps/chosen": -174.47206115722656, + "logps/rejected": -1272.278564453125, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0462062358856201, + "rewards/margins": 11.218513488769531, + "rewards/rejected": -12.264719009399414, + "step": 14280 + }, + { + "epoch": 0.85, + "learning_rate": 3.2575907100584976e-07, + "logits/chosen": -2.6261942386627197, + "logits/rejected": -2.160449981689453, + "logps/chosen": -168.2234344482422, + "logps/rejected": -1224.8074951171875, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.976810097694397, + "rewards/margins": 10.81821060180664, + "rewards/rejected": -11.79502010345459, + "step": 14290 + }, + { + "epoch": 0.85, + "learning_rate": 3.2319529157278427e-07, + "logits/chosen": -2.6749837398529053, + "logits/rejected": -2.167555809020996, + "logps/chosen": -156.91146850585938, + "logps/rejected": -1224.4534912109375, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8893406987190247, + "rewards/margins": 10.914307594299316, + "rewards/rejected": -11.803647994995117, + "step": 14300 + }, + { + "epoch": 0.85, + "learning_rate": 3.2064094336052176e-07, + "logits/chosen": -2.660374641418457, + "logits/rejected": -2.2785587310791016, + "logps/chosen": -180.96017456054688, + "logps/rejected": -1205.510009765625, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1168030500411987, + "rewards/margins": 10.501714706420898, + "rewards/rejected": -11.618518829345703, + "step": 14310 + }, + { + "epoch": 0.85, + "learning_rate": 3.1809603743602783e-07, + "logits/chosen": -2.633387327194214, + "logits/rejected": -2.2793474197387695, + "logps/chosen": -198.32861328125, + "logps/rejected": -1229.17431640625, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2924237251281738, + "rewards/margins": 10.56283187866211, + "rewards/rejected": -11.855255126953125, + "step": 14320 + }, + { + "epoch": 0.85, + "learning_rate": 3.1556058482535817e-07, + "logits/chosen": -2.722856044769287, + "logits/rejected": -2.251875162124634, + "logps/chosen": -165.18190002441406, + "logps/rejected": -1172.5921630859375, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8998897671699524, + "rewards/margins": 10.391897201538086, + "rewards/rejected": -11.291788101196289, + "step": 14330 + }, + { + "epoch": 0.86, + "learning_rate": 3.1303459651361027e-07, + "logits/chosen": -2.690218687057495, + "logits/rejected": -2.2556252479553223, + "logps/chosen": -183.59963989257812, + "logps/rejected": -1241.440185546875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1058900356292725, + "rewards/margins": 10.85296630859375, + "rewards/rejected": -11.958856582641602, + "step": 14340 + }, + { + "epoch": 0.86, + "learning_rate": 3.105180834448776e-07, + "logits/chosen": -2.6766045093536377, + "logits/rejected": -2.1780879497528076, + "logps/chosen": -178.98837280273438, + "logps/rejected": -1316.917724609375, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0893800258636475, + "rewards/margins": 11.631126403808594, + "rewards/rejected": -12.72050666809082, + "step": 14350 + }, + { + "epoch": 0.86, + "learning_rate": 3.080110565222008e-07, + "logits/chosen": -2.658325433731079, + "logits/rejected": -2.1948657035827637, + "logps/chosen": -170.4897918701172, + "logps/rejected": -1229.463623046875, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9832500219345093, + "rewards/margins": 10.863972663879395, + "rewards/rejected": -11.847222328186035, + "step": 14360 + }, + { + "epoch": 0.86, + "learning_rate": 3.05513526607521e-07, + "logits/chosen": -2.651050090789795, + "logits/rejected": -2.208951473236084, + "logps/chosen": -170.09188842773438, + "logps/rejected": -1252.8548583984375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9993183016777039, + "rewards/margins": 11.10303020477295, + "rewards/rejected": -12.102350234985352, + "step": 14370 + }, + { + "epoch": 0.86, + "learning_rate": 3.0302550452163294e-07, + "logits/chosen": -2.65950608253479, + "logits/rejected": -2.1992413997650146, + "logps/chosen": -183.9871368408203, + "logps/rejected": -1185.7957763671875, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0926755666732788, + "rewards/margins": 10.318548202514648, + "rewards/rejected": -11.411224365234375, + "step": 14380 + }, + { + "epoch": 0.86, + "learning_rate": 3.0054700104413666e-07, + "logits/chosen": -2.667231321334839, + "logits/rejected": -2.3050522804260254, + "logps/chosen": -178.59092712402344, + "logps/rejected": -1189.386962890625, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0339524745941162, + "rewards/margins": 10.397581100463867, + "rewards/rejected": -11.431532859802246, + "step": 14390 + }, + { + "epoch": 0.86, + "learning_rate": 2.980780269133937e-07, + "logits/chosen": -2.64802622795105, + "logits/rejected": -2.2131125926971436, + "logps/chosen": -169.2388916015625, + "logps/rejected": -1173.3343505859375, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9914522171020508, + "rewards/margins": 10.293741226196289, + "rewards/rejected": -11.28519344329834, + "step": 14400 + }, + { + "epoch": 0.86, + "learning_rate": 2.956185928264757e-07, + "logits/chosen": -2.6555652618408203, + "logits/rejected": -2.193197011947632, + "logps/chosen": -163.09716796875, + "logps/rejected": -1338.634033203125, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.956673800945282, + "rewards/margins": 11.981584548950195, + "rewards/rejected": -12.938260078430176, + "step": 14410 + }, + { + "epoch": 0.86, + "learning_rate": 2.9316870943912554e-07, + "logits/chosen": -2.6932625770568848, + "logits/rejected": -2.255809783935547, + "logps/chosen": -193.12936401367188, + "logps/rejected": -1331.60791015625, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1898857355117798, + "rewards/margins": 11.672222137451172, + "rewards/rejected": -12.86210823059082, + "step": 14420 + }, + { + "epoch": 0.86, + "learning_rate": 2.9072838736570243e-07, + "logits/chosen": -2.6873533725738525, + "logits/rejected": -2.2109134197235107, + "logps/chosen": -182.5050811767578, + "logps/rejected": -1252.6083984375, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1479988098144531, + "rewards/margins": 10.920316696166992, + "rewards/rejected": -12.068315505981445, + "step": 14430 + }, + { + "epoch": 0.86, + "learning_rate": 2.8829763717914266e-07, + "logits/chosen": -2.671875, + "logits/rejected": -2.201972723007202, + "logps/chosen": -163.68991088867188, + "logps/rejected": -1191.8785400390625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9103431701660156, + "rewards/margins": 10.550494194030762, + "rewards/rejected": -11.460836410522461, + "step": 14440 + }, + { + "epoch": 0.86, + "learning_rate": 2.8587646941091116e-07, + "logits/chosen": -2.706421375274658, + "logits/rejected": -2.1908249855041504, + "logps/chosen": -162.16207885742188, + "logps/rejected": -1268.368896484375, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9255984425544739, + "rewards/margins": 11.313652038574219, + "rewards/rejected": -12.239252090454102, + "step": 14450 + }, + { + "epoch": 0.86, + "learning_rate": 2.834648945509552e-07, + "logits/chosen": -2.6842308044433594, + "logits/rejected": -2.2188427448272705, + "logps/chosen": -172.319580078125, + "logps/rejected": -1205.335693359375, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9818706512451172, + "rewards/margins": 10.621633529663086, + "rewards/rejected": -11.603503227233887, + "step": 14460 + }, + { + "epoch": 0.86, + "learning_rate": 2.810629230476611e-07, + "logits/chosen": -2.613492250442505, + "logits/rejected": -2.1703238487243652, + "logps/chosen": -179.2425079345703, + "logps/rejected": -1238.242919921875, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0603941679000854, + "rewards/margins": 10.867496490478516, + "rewards/rejected": -11.927891731262207, + "step": 14470 + }, + { + "epoch": 0.86, + "learning_rate": 2.786705653078062e-07, + "logits/chosen": -2.6873011589050293, + "logits/rejected": -2.2895147800445557, + "logps/chosen": -157.36329650878906, + "logps/rejected": -1272.4246826171875, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9024655222892761, + "rewards/margins": 11.387192726135254, + "rewards/rejected": -12.289657592773438, + "step": 14480 + }, + { + "epoch": 0.86, + "learning_rate": 2.76287831696517e-07, + "logits/chosen": -2.686208724975586, + "logits/rejected": -2.3067870140075684, + "logps/chosen": -168.56027221679688, + "logps/rejected": -1299.931396484375, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9828259348869324, + "rewards/margins": 11.578295707702637, + "rewards/rejected": -12.561121940612793, + "step": 14490 + }, + { + "epoch": 0.86, + "learning_rate": 2.7391473253722017e-07, + "logits/chosen": -2.6310219764709473, + "logits/rejected": -2.2018754482269287, + "logps/chosen": -189.600830078125, + "logps/rejected": -1253.774169921875, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.195866346359253, + "rewards/margins": 10.903592109680176, + "rewards/rejected": -12.099458694458008, + "step": 14500 + }, + { + "epoch": 0.87, + "learning_rate": 2.7155127811160336e-07, + "logits/chosen": -2.679464817047119, + "logits/rejected": -2.1640472412109375, + "logps/chosen": -157.51136779785156, + "logps/rejected": -1292.6475830078125, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8621135950088501, + "rewards/margins": 11.614855766296387, + "rewards/rejected": -12.476968765258789, + "step": 14510 + }, + { + "epoch": 0.87, + "learning_rate": 2.6919747865956413e-07, + "logits/chosen": -2.7105636596679688, + "logits/rejected": -2.191953659057617, + "logps/chosen": -165.73458862304688, + "logps/rejected": -1390.4169921875, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9154362678527832, + "rewards/margins": 12.541097640991211, + "rewards/rejected": -13.45653247833252, + "step": 14520 + }, + { + "epoch": 0.87, + "learning_rate": 2.668533443791707e-07, + "logits/chosen": -2.6745681762695312, + "logits/rejected": -2.2801852226257324, + "logps/chosen": -178.88168334960938, + "logps/rejected": -1224.2813720703125, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.053114414215088, + "rewards/margins": 10.73914909362793, + "rewards/rejected": -11.792263984680176, + "step": 14530 + }, + { + "epoch": 0.87, + "learning_rate": 2.645188854266162e-07, + "logits/chosen": -2.70839524269104, + "logits/rejected": -2.3101611137390137, + "logps/chosen": -179.01614379882812, + "logps/rejected": -1197.5069580078125, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0526068210601807, + "rewards/margins": 10.463216781616211, + "rewards/rejected": -11.515823364257812, + "step": 14540 + }, + { + "epoch": 0.87, + "learning_rate": 2.621941119161739e-07, + "logits/chosen": -2.6644093990325928, + "logits/rejected": -2.1778931617736816, + "logps/chosen": -160.2152862548828, + "logps/rejected": -1215.9842529296875, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9512295722961426, + "rewards/margins": 10.764131546020508, + "rewards/rejected": -11.715360641479492, + "step": 14550 + }, + { + "epoch": 0.87, + "learning_rate": 2.598790339201537e-07, + "logits/chosen": -2.653076648712158, + "logits/rejected": -2.2415120601654053, + "logps/chosen": -207.11050415039062, + "logps/rejected": -1230.3896484375, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4329458475112915, + "rewards/margins": 10.431455612182617, + "rewards/rejected": -11.864401817321777, + "step": 14560 + }, + { + "epoch": 0.87, + "learning_rate": 2.575736614688595e-07, + "logits/chosen": -2.6746716499328613, + "logits/rejected": -2.2277660369873047, + "logps/chosen": -190.6104736328125, + "logps/rejected": -1247.4610595703125, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2100681066513062, + "rewards/margins": 10.824273109436035, + "rewards/rejected": -12.034341812133789, + "step": 14570 + }, + { + "epoch": 0.87, + "learning_rate": 2.552780045505446e-07, + "logits/chosen": -2.6615734100341797, + "logits/rejected": -2.192617893218994, + "logps/chosen": -164.65646362304688, + "logps/rejected": -1250.136474609375, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9407480359077454, + "rewards/margins": 11.12216854095459, + "rewards/rejected": -12.062917709350586, + "step": 14580 + }, + { + "epoch": 0.87, + "learning_rate": 2.529920731113672e-07, + "logits/chosen": -2.6694424152374268, + "logits/rejected": -2.240365743637085, + "logps/chosen": -186.12057495117188, + "logps/rejected": -1314.60107421875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1555207967758179, + "rewards/margins": 11.549055099487305, + "rewards/rejected": -12.704575538635254, + "step": 14590 + }, + { + "epoch": 0.87, + "learning_rate": 2.507158770553528e-07, + "logits/chosen": -2.6896824836730957, + "logits/rejected": -2.152794122695923, + "logps/chosen": -206.90377807617188, + "logps/rejected": -1183.181884765625, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2904070615768433, + "rewards/margins": 10.098420143127441, + "rewards/rejected": -11.388827323913574, + "step": 14600 + }, + { + "epoch": 0.87, + "learning_rate": 2.484494262443429e-07, + "logits/chosen": -2.669369697570801, + "logits/rejected": -2.1714138984680176, + "logps/chosen": -148.35142517089844, + "logps/rejected": -1362.5677490234375, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7646754384040833, + "rewards/margins": 12.405680656433105, + "rewards/rejected": -13.170356750488281, + "step": 14610 + }, + { + "epoch": 0.87, + "learning_rate": 2.4619273049796e-07, + "logits/chosen": -2.6571056842803955, + "logits/rejected": -2.2819979190826416, + "logps/chosen": -172.59158325195312, + "logps/rejected": -1309.4736328125, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0510157346725464, + "rewards/margins": 11.601158142089844, + "rewards/rejected": -12.652173042297363, + "step": 14620 + }, + { + "epoch": 0.87, + "learning_rate": 2.439457995935604e-07, + "logits/chosen": -2.6655564308166504, + "logits/rejected": -2.193443775177002, + "logps/chosen": -166.09619140625, + "logps/rejected": -1301.968017578125, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.976818859577179, + "rewards/margins": 11.59904956817627, + "rewards/rejected": -12.575868606567383, + "step": 14630 + }, + { + "epoch": 0.87, + "learning_rate": 2.417086432661939e-07, + "logits/chosen": -2.676795482635498, + "logits/rejected": -2.2105135917663574, + "logps/chosen": -160.29208374023438, + "logps/rejected": -1297.194580078125, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.863673985004425, + "rewards/margins": 11.653975486755371, + "rewards/rejected": -12.517648696899414, + "step": 14640 + }, + { + "epoch": 0.87, + "learning_rate": 2.394812712085598e-07, + "logits/chosen": -2.664961338043213, + "logits/rejected": -2.32708740234375, + "logps/chosen": -160.30398559570312, + "logps/rejected": -1327.02734375, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9610745310783386, + "rewards/margins": 11.851113319396973, + "rewards/rejected": -12.812185287475586, + "step": 14650 + }, + { + "epoch": 0.87, + "learning_rate": 2.3726369307096765e-07, + "logits/chosen": -2.665367841720581, + "logits/rejected": -2.303982734680176, + "logps/chosen": -185.8868865966797, + "logps/rejected": -1194.243408203125, + "loss": 0.0382, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.1416006088256836, + "rewards/margins": 10.349721908569336, + "rewards/rejected": -11.49132251739502, + "step": 14660 + }, + { + "epoch": 0.87, + "learning_rate": 2.3505591846129356e-07, + "logits/chosen": -2.67551851272583, + "logits/rejected": -2.2528491020202637, + "logps/chosen": -192.8577423095703, + "logps/rejected": -1303.645263671875, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2152811288833618, + "rewards/margins": 11.37720012664795, + "rewards/rejected": -12.59248161315918, + "step": 14670 + }, + { + "epoch": 0.88, + "learning_rate": 2.3285795694493686e-07, + "logits/chosen": -2.6500518321990967, + "logits/rejected": -2.2434792518615723, + "logps/chosen": -184.54811096191406, + "logps/rejected": -1109.455810546875, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.110153079032898, + "rewards/margins": 9.554444313049316, + "rewards/rejected": -10.66459846496582, + "step": 14680 + }, + { + "epoch": 0.88, + "learning_rate": 2.3066981804478416e-07, + "logits/chosen": -2.622807741165161, + "logits/rejected": -2.1578707695007324, + "logps/chosen": -168.41107177734375, + "logps/rejected": -1361.412841796875, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0306992530822754, + "rewards/margins": 12.140836715698242, + "rewards/rejected": -13.171536445617676, + "step": 14690 + }, + { + "epoch": 0.88, + "learning_rate": 2.2849151124116148e-07, + "logits/chosen": -2.643070697784424, + "logits/rejected": -2.2879068851470947, + "logps/chosen": -182.92086791992188, + "logps/rejected": -1253.0843505859375, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.120378851890564, + "rewards/margins": 10.963888168334961, + "rewards/rejected": -12.084267616271973, + "step": 14700 + }, + { + "epoch": 0.88, + "learning_rate": 2.2632304597179827e-07, + "logits/chosen": -2.6435441970825195, + "logits/rejected": -2.167996644973755, + "logps/chosen": -186.99679565429688, + "logps/rejected": -1212.7271728515625, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1710927486419678, + "rewards/margins": 10.504181861877441, + "rewards/rejected": -11.675273895263672, + "step": 14710 + }, + { + "epoch": 0.88, + "learning_rate": 2.2416443163178342e-07, + "logits/chosen": -2.6479573249816895, + "logits/rejected": -2.1469974517822266, + "logps/chosen": -167.46426391601562, + "logps/rejected": -1331.291748046875, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9423903226852417, + "rewards/margins": 11.911943435668945, + "rewards/rejected": -12.854333877563477, + "step": 14720 + }, + { + "epoch": 0.88, + "learning_rate": 2.2201567757352631e-07, + "logits/chosen": -2.6761555671691895, + "logits/rejected": -2.2536380290985107, + "logps/chosen": -175.158203125, + "logps/rejected": -1269.8704833984375, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0826818943023682, + "rewards/margins": 11.166032791137695, + "rewards/rejected": -12.248714447021484, + "step": 14730 + }, + { + "epoch": 0.88, + "learning_rate": 2.1987679310671582e-07, + "logits/chosen": -2.702472686767578, + "logits/rejected": -2.3169729709625244, + "logps/chosen": -167.0040283203125, + "logps/rejected": -1179.12158203125, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.938464343547821, + "rewards/margins": 10.411504745483398, + "rewards/rejected": -11.349969863891602, + "step": 14740 + }, + { + "epoch": 0.88, + "learning_rate": 2.1774778749827946e-07, + "logits/chosen": -2.6422219276428223, + "logits/rejected": -2.2057957649230957, + "logps/chosen": -184.132080078125, + "logps/rejected": -1134.3599853515625, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1921025514602661, + "rewards/margins": 9.710203170776367, + "rewards/rejected": -10.902303695678711, + "step": 14750 + }, + { + "epoch": 0.88, + "learning_rate": 2.1562866997234421e-07, + "logits/chosen": -2.6585116386413574, + "logits/rejected": -2.2246639728546143, + "logps/chosen": -177.4308319091797, + "logps/rejected": -1236.3043212890625, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1117875576019287, + "rewards/margins": 10.802475929260254, + "rewards/rejected": -11.914262771606445, + "step": 14760 + }, + { + "epoch": 0.88, + "learning_rate": 2.1351944971019362e-07, + "logits/chosen": -2.687551498413086, + "logits/rejected": -2.2716846466064453, + "logps/chosen": -175.306396484375, + "logps/rejected": -1306.552978515625, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.029445767402649, + "rewards/margins": 11.590786933898926, + "rewards/rejected": -12.620233535766602, + "step": 14770 + }, + { + "epoch": 0.88, + "learning_rate": 2.1142013585023464e-07, + "logits/chosen": -2.633091688156128, + "logits/rejected": -2.1895394325256348, + "logps/chosen": -192.74295043945312, + "logps/rejected": -1113.7147216796875, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2129895687103271, + "rewards/margins": 9.494071006774902, + "rewards/rejected": -10.707061767578125, + "step": 14780 + }, + { + "epoch": 0.88, + "learning_rate": 2.0933073748794996e-07, + "logits/chosen": -2.716001272201538, + "logits/rejected": -2.353024482727051, + "logps/chosen": -208.6348419189453, + "logps/rejected": -1199.489013671875, + "loss": 0.0202, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.3700079917907715, + "rewards/margins": 10.19678020477295, + "rewards/rejected": -11.566787719726562, + "step": 14790 + }, + { + "epoch": 0.88, + "learning_rate": 2.072512636758639e-07, + "logits/chosen": -2.6699705123901367, + "logits/rejected": -2.196615219116211, + "logps/chosen": -187.41168212890625, + "logps/rejected": -1271.893798828125, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.151033639907837, + "rewards/margins": 11.120112419128418, + "rewards/rejected": -12.271146774291992, + "step": 14800 + }, + { + "epoch": 0.88, + "learning_rate": 2.051817234235015e-07, + "logits/chosen": -2.6464037895202637, + "logits/rejected": -2.190279722213745, + "logps/chosen": -162.7576446533203, + "logps/rejected": -1345.40673828125, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9371393322944641, + "rewards/margins": 12.077689170837402, + "rewards/rejected": -13.0148286819458, + "step": 14810 + }, + { + "epoch": 0.88, + "learning_rate": 2.0312212569735035e-07, + "logits/chosen": -2.620392322540283, + "logits/rejected": -2.186061382293701, + "logps/chosen": -193.6808624267578, + "logps/rejected": -1146.101806640625, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2068160772323608, + "rewards/margins": 9.815080642700195, + "rewards/rejected": -11.021897315979004, + "step": 14820 + }, + { + "epoch": 0.88, + "learning_rate": 2.0107247942081963e-07, + "logits/chosen": -2.70564603805542, + "logits/rejected": -2.2533860206604004, + "logps/chosen": -169.4835968017578, + "logps/rejected": -1190.15185546875, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9564429521560669, + "rewards/margins": 10.50793743133545, + "rewards/rejected": -11.464380264282227, + "step": 14830 + }, + { + "epoch": 0.88, + "learning_rate": 1.990327934742045e-07, + "logits/chosen": -2.618466854095459, + "logits/rejected": -2.245718002319336, + "logps/chosen": -176.75430297851562, + "logps/rejected": -1181.525146484375, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0724035501480103, + "rewards/margins": 10.301519393920898, + "rewards/rejected": -11.373924255371094, + "step": 14840 + }, + { + "epoch": 0.89, + "learning_rate": 1.9700307669464515e-07, + "logits/chosen": -2.627598762512207, + "logits/rejected": -2.1568453311920166, + "logps/chosen": -161.8828582763672, + "logps/rejected": -1258.74267578125, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9156071543693542, + "rewards/margins": 11.22768497467041, + "rewards/rejected": -12.143292427062988, + "step": 14850 + }, + { + "epoch": 0.89, + "learning_rate": 1.949833378760882e-07, + "logits/chosen": -2.6692910194396973, + "logits/rejected": -2.1686606407165527, + "logps/chosen": -169.31985473632812, + "logps/rejected": -1167.5325927734375, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9801681637763977, + "rewards/margins": 10.247018814086914, + "rewards/rejected": -11.22718620300293, + "step": 14860 + }, + { + "epoch": 0.89, + "learning_rate": 1.92973585769253e-07, + "logits/chosen": -2.695345401763916, + "logits/rejected": -2.209428310394287, + "logps/chosen": -190.50949096679688, + "logps/rejected": -1161.2869873046875, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.144087791442871, + "rewards/margins": 10.028764724731445, + "rewards/rejected": -11.172852516174316, + "step": 14870 + }, + { + "epoch": 0.89, + "learning_rate": 1.9097382908158713e-07, + "logits/chosen": -2.6855318546295166, + "logits/rejected": -2.2859859466552734, + "logps/chosen": -168.3792724609375, + "logps/rejected": -1155.4234619140625, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.996250331401825, + "rewards/margins": 10.118196487426758, + "rewards/rejected": -11.114445686340332, + "step": 14880 + }, + { + "epoch": 0.89, + "learning_rate": 1.8898407647723327e-07, + "logits/chosen": -2.6655187606811523, + "logits/rejected": -2.2823891639709473, + "logps/chosen": -170.9694366455078, + "logps/rejected": -1249.5794677734375, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0004255771636963, + "rewards/margins": 11.049158096313477, + "rewards/rejected": -12.049583435058594, + "step": 14890 + }, + { + "epoch": 0.89, + "learning_rate": 1.8700433657699162e-07, + "logits/chosen": -2.673092842102051, + "logits/rejected": -2.284445285797119, + "logps/chosen": -179.8528289794922, + "logps/rejected": -1253.5740966796875, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0807421207427979, + "rewards/margins": 11.017679214477539, + "rewards/rejected": -12.098422050476074, + "step": 14900 + }, + { + "epoch": 0.89, + "learning_rate": 1.8503461795827958e-07, + "logits/chosen": -2.682305335998535, + "logits/rejected": -2.2803447246551514, + "logps/chosen": -182.93017578125, + "logps/rejected": -1284.439697265625, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1043970584869385, + "rewards/margins": 11.29609489440918, + "rewards/rejected": -12.400491714477539, + "step": 14910 + }, + { + "epoch": 0.89, + "learning_rate": 1.8307492915509705e-07, + "logits/chosen": -2.635374069213867, + "logits/rejected": -2.2397847175598145, + "logps/chosen": -180.3602752685547, + "logps/rejected": -1274.7506103515625, + "loss": 0.0267, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.045652985572815, + "rewards/margins": 11.249390602111816, + "rewards/rejected": -12.2950439453125, + "step": 14920 + }, + { + "epoch": 0.89, + "learning_rate": 1.8112527865798896e-07, + "logits/chosen": -2.6177165508270264, + "logits/rejected": -2.2755227088928223, + "logps/chosen": -188.5, + "logps/rejected": -1290.842041015625, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1717121601104736, + "rewards/margins": 11.276228904724121, + "rewards/rejected": -12.4479398727417, + "step": 14930 + }, + { + "epoch": 0.89, + "learning_rate": 1.7918567491400862e-07, + "logits/chosen": -2.626162528991699, + "logits/rejected": -2.187166213989258, + "logps/chosen": -179.59324645996094, + "logps/rejected": -1198.760498046875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0399250984191895, + "rewards/margins": 10.51546859741211, + "rewards/rejected": -11.55539321899414, + "step": 14940 + }, + { + "epoch": 0.89, + "learning_rate": 1.7725612632667895e-07, + "logits/chosen": -2.627930164337158, + "logits/rejected": -2.164045810699463, + "logps/chosen": -177.1788787841797, + "logps/rejected": -1292.7125244140625, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0708956718444824, + "rewards/margins": 11.406911849975586, + "rewards/rejected": -12.477807998657227, + "step": 14950 + }, + { + "epoch": 0.89, + "learning_rate": 1.7533664125596038e-07, + "logits/chosen": -2.635820150375366, + "logits/rejected": -2.2120137214660645, + "logps/chosen": -170.9647979736328, + "logps/rejected": -1227.5697021484375, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0084832906723022, + "rewards/margins": 10.832781791687012, + "rewards/rejected": -11.841263771057129, + "step": 14960 + }, + { + "epoch": 0.89, + "learning_rate": 1.7342722801821143e-07, + "logits/chosen": -2.6447854042053223, + "logits/rejected": -2.245664596557617, + "logps/chosen": -163.8087158203125, + "logps/rejected": -1210.20068359375, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9513195753097534, + "rewards/margins": 10.699905395507812, + "rewards/rejected": -11.651225090026855, + "step": 14970 + }, + { + "epoch": 0.89, + "learning_rate": 1.7152789488615124e-07, + "logits/chosen": -2.64689302444458, + "logits/rejected": -2.2711923122406006, + "logps/chosen": -205.9303741455078, + "logps/rejected": -1338.5269775390625, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3396284580230713, + "rewards/margins": 11.595454216003418, + "rewards/rejected": -12.935083389282227, + "step": 14980 + }, + { + "epoch": 0.89, + "learning_rate": 1.6963865008882975e-07, + "logits/chosen": -2.685518741607666, + "logits/rejected": -2.210488796234131, + "logps/chosen": -182.5147705078125, + "logps/rejected": -1331.3974609375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1285771131515503, + "rewards/margins": 11.744684219360352, + "rewards/rejected": -12.873260498046875, + "step": 14990 + }, + { + "epoch": 0.89, + "learning_rate": 1.6775950181158462e-07, + "logits/chosen": -2.654169797897339, + "logits/rejected": -2.188753128051758, + "logps/chosen": -165.80029296875, + "logps/rejected": -1171.6915283203125, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9859506487846375, + "rewards/margins": 10.283140182495117, + "rewards/rejected": -11.26909065246582, + "step": 15000 + }, + { + "epoch": 0.9, + "learning_rate": 1.6589045819601134e-07, + "logits/chosen": -2.603675603866577, + "logits/rejected": -2.1540234088897705, + "logps/chosen": -183.5207061767578, + "logps/rejected": -1190.9810791015625, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1452476978302002, + "rewards/margins": 10.333688735961914, + "rewards/rejected": -11.478937149047852, + "step": 15010 + }, + { + "epoch": 0.9, + "learning_rate": 1.640315273399254e-07, + "logits/chosen": -2.6677446365356445, + "logits/rejected": -2.1996638774871826, + "logps/chosen": -174.00079345703125, + "logps/rejected": -1314.3968505859375, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0667635202407837, + "rewards/margins": 11.641279220581055, + "rewards/rejected": -12.708044052124023, + "step": 15020 + }, + { + "epoch": 0.9, + "learning_rate": 1.621827172973281e-07, + "logits/chosen": -2.646925449371338, + "logits/rejected": -2.184150218963623, + "logps/chosen": -169.05628967285156, + "logps/rejected": -1187.2843017578125, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9964550137519836, + "rewards/margins": 10.43781852722168, + "rewards/rejected": -11.434274673461914, + "step": 15030 + }, + { + "epoch": 0.9, + "learning_rate": 1.603440360783709e-07, + "logits/chosen": -2.6401660442352295, + "logits/rejected": -2.189229726791382, + "logps/chosen": -182.5482635498047, + "logps/rejected": -1237.8343505859375, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1609185934066772, + "rewards/margins": 10.766833305358887, + "rewards/rejected": -11.927752494812012, + "step": 15040 + }, + { + "epoch": 0.9, + "learning_rate": 1.5851549164932118e-07, + "logits/chosen": -2.67946457862854, + "logits/rejected": -2.2476909160614014, + "logps/chosen": -173.39321899414062, + "logps/rejected": -1295.132080078125, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.021991491317749, + "rewards/margins": 11.490584373474121, + "rewards/rejected": -12.512575149536133, + "step": 15050 + }, + { + "epoch": 0.9, + "learning_rate": 1.5669709193252835e-07, + "logits/chosen": -2.652005672454834, + "logits/rejected": -2.239576816558838, + "logps/chosen": -169.68539428710938, + "logps/rejected": -1189.311279296875, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.015600323677063, + "rewards/margins": 10.432500839233398, + "rewards/rejected": -11.448099136352539, + "step": 15060 + }, + { + "epoch": 0.9, + "learning_rate": 1.5488884480638677e-07, + "logits/chosen": -2.6556079387664795, + "logits/rejected": -2.2443923950195312, + "logps/chosen": -197.99832153320312, + "logps/rejected": -1294.5548095703125, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.237754225730896, + "rewards/margins": 11.259954452514648, + "rewards/rejected": -12.497709274291992, + "step": 15070 + }, + { + "epoch": 0.9, + "learning_rate": 1.5309075810530732e-07, + "logits/chosen": -2.699147939682007, + "logits/rejected": -2.2865710258483887, + "logps/chosen": -154.24337768554688, + "logps/rejected": -1166.4036865234375, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.848885715007782, + "rewards/margins": 10.372926712036133, + "rewards/rejected": -11.221811294555664, + "step": 15080 + }, + { + "epoch": 0.9, + "learning_rate": 1.5130283961967614e-07, + "logits/chosen": -2.6388185024261475, + "logits/rejected": -2.1456522941589355, + "logps/chosen": -147.41668701171875, + "logps/rejected": -1251.6011962890625, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.782366156578064, + "rewards/margins": 11.289183616638184, + "rewards/rejected": -12.071551322937012, + "step": 15090 + }, + { + "epoch": 0.9, + "learning_rate": 1.4952509709582673e-07, + "logits/chosen": -2.671745777130127, + "logits/rejected": -2.2714924812316895, + "logps/chosen": -164.42474365234375, + "logps/rejected": -1189.809814453125, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.96100252866745, + "rewards/margins": 10.500677108764648, + "rewards/rejected": -11.46168041229248, + "step": 15100 + }, + { + "epoch": 0.9, + "learning_rate": 1.4775753823600359e-07, + "logits/chosen": -2.6454641819000244, + "logits/rejected": -2.2221553325653076, + "logps/chosen": -185.87667846679688, + "logps/rejected": -1236.68701171875, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.135982871055603, + "rewards/margins": 10.797276496887207, + "rewards/rejected": -11.933259963989258, + "step": 15110 + }, + { + "epoch": 0.9, + "learning_rate": 1.460001706983294e-07, + "logits/chosen": -2.657090187072754, + "logits/rejected": -2.199199676513672, + "logps/chosen": -168.357177734375, + "logps/rejected": -1134.2022705078125, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9889437556266785, + "rewards/margins": 9.91118335723877, + "rewards/rejected": -10.900126457214355, + "step": 15120 + }, + { + "epoch": 0.9, + "learning_rate": 1.442530020967725e-07, + "logits/chosen": -2.7151927947998047, + "logits/rejected": -2.300499200820923, + "logps/chosen": -169.01512145996094, + "logps/rejected": -1328.2789306640625, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.988416850566864, + "rewards/margins": 11.847478866577148, + "rewards/rejected": -12.835896492004395, + "step": 15130 + }, + { + "epoch": 0.9, + "learning_rate": 1.4251604000111275e-07, + "logits/chosen": -2.636293411254883, + "logits/rejected": -2.2017130851745605, + "logps/chosen": -160.78887939453125, + "logps/rejected": -1261.2135009765625, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9082280993461609, + "rewards/margins": 11.26390552520752, + "rewards/rejected": -12.172134399414062, + "step": 15140 + }, + { + "epoch": 0.9, + "learning_rate": 1.4078929193691e-07, + "logits/chosen": -2.673943042755127, + "logits/rejected": -2.202122211456299, + "logps/chosen": -171.90130615234375, + "logps/rejected": -1177.3507080078125, + "loss": 0.0265, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9695454835891724, + "rewards/margins": 10.366622924804688, + "rewards/rejected": -11.33616828918457, + "step": 15150 + }, + { + "epoch": 0.9, + "learning_rate": 1.3907276538546898e-07, + "logits/chosen": -2.6691880226135254, + "logits/rejected": -2.2728888988494873, + "logps/chosen": -205.15542602539062, + "logps/rejected": -1242.2779541015625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3748595714569092, + "rewards/margins": 10.599506378173828, + "rewards/rejected": -11.974365234375, + "step": 15160 + }, + { + "epoch": 0.9, + "learning_rate": 1.3736646778381159e-07, + "logits/chosen": -2.711009979248047, + "logits/rejected": -2.2541840076446533, + "logps/chosen": -189.1422882080078, + "logps/rejected": -1210.8173828125, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2040817737579346, + "rewards/margins": 10.449132919311523, + "rewards/rejected": -11.653213500976562, + "step": 15170 + }, + { + "epoch": 0.91, + "learning_rate": 1.3567040652463946e-07, + "logits/chosen": -2.668684244155884, + "logits/rejected": -2.238323450088501, + "logps/chosen": -168.61758422851562, + "logps/rejected": -1393.0931396484375, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9762320518493652, + "rewards/margins": 12.500678062438965, + "rewards/rejected": -13.476910591125488, + "step": 15180 + }, + { + "epoch": 0.91, + "learning_rate": 1.339845889563049e-07, + "logits/chosen": -2.6397979259490967, + "logits/rejected": -2.2097837924957275, + "logps/chosen": -149.66696166992188, + "logps/rejected": -1432.6214599609375, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8102267980575562, + "rewards/margins": 13.069025039672852, + "rewards/rejected": -13.879251480102539, + "step": 15190 + }, + { + "epoch": 0.91, + "learning_rate": 1.3230902238277887e-07, + "logits/chosen": -2.686516761779785, + "logits/rejected": -2.2522130012512207, + "logps/chosen": -171.831298828125, + "logps/rejected": -1325.211181640625, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0223077535629272, + "rewards/margins": 11.766373634338379, + "rewards/rejected": -12.788681030273438, + "step": 15200 + }, + { + "epoch": 0.91, + "learning_rate": 1.3064371406361854e-07, + "logits/chosen": -2.6649279594421387, + "logits/rejected": -2.2388997077941895, + "logps/chosen": -164.57571411132812, + "logps/rejected": -1251.55712890625, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9790751338005066, + "rewards/margins": 11.08824348449707, + "rewards/rejected": -12.0673189163208, + "step": 15210 + }, + { + "epoch": 0.91, + "learning_rate": 1.2898867121393627e-07, + "logits/chosen": -2.571096181869507, + "logits/rejected": -2.145815134048462, + "logps/chosen": -186.50498962402344, + "logps/rejected": -1170.244873046875, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2005667686462402, + "rewards/margins": 10.064101219177246, + "rewards/rejected": -11.264668464660645, + "step": 15220 + }, + { + "epoch": 0.91, + "learning_rate": 1.273439010043681e-07, + "logits/chosen": -2.6600730419158936, + "logits/rejected": -2.209486961364746, + "logps/chosen": -160.07379150390625, + "logps/rejected": -1236.22216796875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8968342542648315, + "rewards/margins": 11.02524185180664, + "rewards/rejected": -11.922076225280762, + "step": 15230 + }, + { + "epoch": 0.91, + "learning_rate": 1.2570941056104348e-07, + "logits/chosen": -2.6403045654296875, + "logits/rejected": -2.1852526664733887, + "logps/chosen": -171.4044189453125, + "logps/rejected": -1165.039306640625, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0091670751571655, + "rewards/margins": 10.193971633911133, + "rewards/rejected": -11.20313835144043, + "step": 15240 + }, + { + "epoch": 0.91, + "learning_rate": 1.2408520696555183e-07, + "logits/chosen": -2.7019858360290527, + "logits/rejected": -2.315781354904175, + "logps/chosen": -184.33883666992188, + "logps/rejected": -1244.1417236328125, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0852398872375488, + "rewards/margins": 10.913727760314941, + "rewards/rejected": -11.998968124389648, + "step": 15250 + }, + { + "epoch": 0.91, + "learning_rate": 1.224712972549172e-07, + "logits/chosen": -2.6305789947509766, + "logits/rejected": -2.266334056854248, + "logps/chosen": -167.66064453125, + "logps/rejected": -1269.989501953125, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9983094334602356, + "rewards/margins": 11.261682510375977, + "rewards/rejected": -12.259991645812988, + "step": 15260 + }, + { + "epoch": 0.91, + "learning_rate": 1.2086768842156065e-07, + "logits/chosen": -2.675300121307373, + "logits/rejected": -2.286644697189331, + "logps/chosen": -174.8889923095703, + "logps/rejected": -1235.01171875, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.015540599822998, + "rewards/margins": 10.896650314331055, + "rewards/rejected": -11.912191390991211, + "step": 15270 + }, + { + "epoch": 0.91, + "learning_rate": 1.1927438741327652e-07, + "logits/chosen": -2.6511597633361816, + "logits/rejected": -2.2300453186035156, + "logps/chosen": -174.6057586669922, + "logps/rejected": -1153.0677490234375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0353620052337646, + "rewards/margins": 10.062275886535645, + "rewards/rejected": -11.097638130187988, + "step": 15280 + }, + { + "epoch": 0.91, + "learning_rate": 1.1769140113319755e-07, + "logits/chosen": -2.7141215801239014, + "logits/rejected": -2.2491402626037598, + "logps/chosen": -169.5233154296875, + "logps/rejected": -1164.0906982421875, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9601899981498718, + "rewards/margins": 10.25045394897461, + "rewards/rejected": -11.210644721984863, + "step": 15290 + }, + { + "epoch": 0.91, + "learning_rate": 1.1611873643976839e-07, + "logits/chosen": -2.6771621704101562, + "logits/rejected": -2.2743866443634033, + "logps/chosen": -158.3846435546875, + "logps/rejected": -1224.721923828125, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8705946207046509, + "rewards/margins": 10.940997123718262, + "rewards/rejected": -11.811592102050781, + "step": 15300 + }, + { + "epoch": 0.91, + "learning_rate": 1.145564001467131e-07, + "logits/chosen": -2.697965145111084, + "logits/rejected": -2.2223258018493652, + "logps/chosen": -181.2537384033203, + "logps/rejected": -1376.65966796875, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0719373226165771, + "rewards/margins": 12.260189056396484, + "rewards/rejected": -13.332125663757324, + "step": 15310 + }, + { + "epoch": 0.91, + "learning_rate": 1.1300439902300814e-07, + "logits/chosen": -2.6815478801727295, + "logits/rejected": -2.2120895385742188, + "logps/chosen": -177.8232879638672, + "logps/rejected": -1183.864013671875, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0381433963775635, + "rewards/margins": 10.360038757324219, + "rewards/rejected": -11.398181915283203, + "step": 15320 + }, + { + "epoch": 0.91, + "learning_rate": 1.1146273979285138e-07, + "logits/chosen": -2.655674695968628, + "logits/rejected": -2.249160051345825, + "logps/chosen": -201.2614288330078, + "logps/rejected": -1120.4149169921875, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3674007654190063, + "rewards/margins": 9.404138565063477, + "rewards/rejected": -10.771540641784668, + "step": 15330 + }, + { + "epoch": 0.91, + "learning_rate": 1.0993142913563209e-07, + "logits/chosen": -2.6248021125793457, + "logits/rejected": -2.0581984519958496, + "logps/chosen": -179.18348693847656, + "logps/rejected": -1243.94287109375, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.133216142654419, + "rewards/margins": 10.868388175964355, + "rewards/rejected": -12.001604080200195, + "step": 15340 + }, + { + "epoch": 0.92, + "learning_rate": 1.0841047368590596e-07, + "logits/chosen": -2.6652657985687256, + "logits/rejected": -2.197152614593506, + "logps/chosen": -171.3666229248047, + "logps/rejected": -1206.4525146484375, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0100961923599243, + "rewards/margins": 10.606683731079102, + "rewards/rejected": -11.616779327392578, + "step": 15350 + }, + { + "epoch": 0.92, + "learning_rate": 1.0689988003336121e-07, + "logits/chosen": -2.688142776489258, + "logits/rejected": -2.2145469188690186, + "logps/chosen": -178.90750122070312, + "logps/rejected": -1156.3896484375, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.052981972694397, + "rewards/margins": 10.077046394348145, + "rewards/rejected": -11.130027770996094, + "step": 15360 + }, + { + "epoch": 0.92, + "learning_rate": 1.0539965472279424e-07, + "logits/chosen": -2.648425579071045, + "logits/rejected": -2.1206278800964355, + "logps/chosen": -165.90655517578125, + "logps/rejected": -1135.87646484375, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9682438969612122, + "rewards/margins": 9.959437370300293, + "rewards/rejected": -10.927680969238281, + "step": 15370 + }, + { + "epoch": 0.92, + "learning_rate": 1.039098042540787e-07, + "logits/chosen": -2.697800397872925, + "logits/rejected": -2.2975144386291504, + "logps/chosen": -182.51307678222656, + "logps/rejected": -1195.71435546875, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0929452180862427, + "rewards/margins": 10.417900085449219, + "rewards/rejected": -11.510844230651855, + "step": 15380 + }, + { + "epoch": 0.92, + "learning_rate": 1.0243033508213873e-07, + "logits/chosen": -2.644886016845703, + "logits/rejected": -2.1226062774658203, + "logps/chosen": -163.13980102539062, + "logps/rejected": -1188.9097900390625, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9173575639724731, + "rewards/margins": 10.52003002166748, + "rewards/rejected": -11.43738842010498, + "step": 15390 + }, + { + "epoch": 0.92, + "learning_rate": 1.0096125361691993e-07, + "logits/chosen": -2.666822671890259, + "logits/rejected": -2.284858226776123, + "logps/chosen": -181.47325134277344, + "logps/rejected": -1255.91552734375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0732786655426025, + "rewards/margins": 11.037242889404297, + "rewards/rejected": -12.11052131652832, + "step": 15400 + }, + { + "epoch": 0.92, + "learning_rate": 9.950256622336258e-08, + "logits/chosen": -2.6804606914520264, + "logits/rejected": -2.273733615875244, + "logps/chosen": -165.30763244628906, + "logps/rejected": -1225.792236328125, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.968244194984436, + "rewards/margins": 10.85100269317627, + "rewards/rejected": -11.819246292114258, + "step": 15410 + }, + { + "epoch": 0.92, + "learning_rate": 9.805427922137373e-08, + "logits/chosen": -2.66705060005188, + "logits/rejected": -2.1891260147094727, + "logps/chosen": -173.49383544921875, + "logps/rejected": -1332.0062255859375, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.030502200126648, + "rewards/margins": 11.832608222961426, + "rewards/rejected": -12.863110542297363, + "step": 15420 + }, + { + "epoch": 0.92, + "learning_rate": 9.661639888579877e-08, + "logits/chosen": -2.6146352291107178, + "logits/rejected": -2.1669552326202393, + "logps/chosen": -169.9706268310547, + "logps/rejected": -1221.0625, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1017240285873413, + "rewards/margins": 10.672877311706543, + "rewards/rejected": -11.7746000289917, + "step": 15430 + }, + { + "epoch": 0.92, + "learning_rate": 9.51889314463969e-08, + "logits/chosen": -2.71271014213562, + "logits/rejected": -2.2638492584228516, + "logps/chosen": -178.9517059326172, + "logps/rejected": -1188.2783203125, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0832500457763672, + "rewards/margins": 10.332706451416016, + "rewards/rejected": -11.415956497192383, + "step": 15440 + }, + { + "epoch": 0.92, + "learning_rate": 9.377188308781038e-08, + "logits/chosen": -2.6629462242126465, + "logits/rejected": -2.190911054611206, + "logps/chosen": -184.59182739257812, + "logps/rejected": -1255.3714599609375, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1611316204071045, + "rewards/margins": 10.942505836486816, + "rewards/rejected": -12.1036376953125, + "step": 15450 + }, + { + "epoch": 0.92, + "learning_rate": 9.236525994954142e-08, + "logits/chosen": -2.66231632232666, + "logits/rejected": -2.2185728549957275, + "logps/chosen": -167.41738891601562, + "logps/rejected": -1202.1636962890625, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9807716608047485, + "rewards/margins": 10.598533630371094, + "rewards/rejected": -11.579305648803711, + "step": 15460 + }, + { + "epoch": 0.92, + "learning_rate": 9.096906812592315e-08, + "logits/chosen": -2.6808948516845703, + "logits/rejected": -2.2528421878814697, + "logps/chosen": -156.71408081054688, + "logps/rejected": -1257.4429931640625, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9337954521179199, + "rewards/margins": 11.1892671585083, + "rewards/rejected": -12.123063087463379, + "step": 15470 + }, + { + "epoch": 0.92, + "learning_rate": 8.958331366609424e-08, + "logits/chosen": -2.6562304496765137, + "logits/rejected": -2.2269248962402344, + "logps/chosen": -193.720947265625, + "logps/rejected": -1293.5947265625, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2302790880203247, + "rewards/margins": 11.262996673583984, + "rewards/rejected": -12.49327564239502, + "step": 15480 + }, + { + "epoch": 0.92, + "learning_rate": 8.820800257397205e-08, + "logits/chosen": -2.657881498336792, + "logits/rejected": -2.233145236968994, + "logps/chosen": -166.1521453857422, + "logps/rejected": -1257.880126953125, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9509795308113098, + "rewards/margins": 11.179496765136719, + "rewards/rejected": -12.130475997924805, + "step": 15490 + }, + { + "epoch": 0.92, + "learning_rate": 8.684314080822764e-08, + "logits/chosen": -2.670079469680786, + "logits/rejected": -2.2392404079437256, + "logps/chosen": -183.1615447998047, + "logps/rejected": -1324.6661376953125, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.120317816734314, + "rewards/margins": 11.68382453918457, + "rewards/rejected": -12.8041410446167, + "step": 15500 + }, + { + "epoch": 0.92, + "learning_rate": 8.54887342822594e-08, + "logits/chosen": -2.6902074813842773, + "logits/rejected": -2.2246272563934326, + "logps/chosen": -176.72415161132812, + "logps/rejected": -1282.885009765625, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0135183334350586, + "rewards/margins": 11.368112564086914, + "rewards/rejected": -12.381631851196289, + "step": 15510 + }, + { + "epoch": 0.93, + "learning_rate": 8.414478886416611e-08, + "logits/chosen": -2.6867082118988037, + "logits/rejected": -2.2736759185791016, + "logps/chosen": -159.37124633789062, + "logps/rejected": -1182.3377685546875, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9226862788200378, + "rewards/margins": 10.444775581359863, + "rewards/rejected": -11.367461204528809, + "step": 15520 + }, + { + "epoch": 0.93, + "learning_rate": 8.281131037672474e-08, + "logits/chosen": -2.671300172805786, + "logits/rejected": -2.2463297843933105, + "logps/chosen": -195.54315185546875, + "logps/rejected": -1259.5648193359375, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2373530864715576, + "rewards/margins": 10.92322063446045, + "rewards/rejected": -12.16057300567627, + "step": 15530 + }, + { + "epoch": 0.93, + "learning_rate": 8.148830459736106e-08, + "logits/chosen": -2.653449058532715, + "logits/rejected": -2.24001145362854, + "logps/chosen": -167.10552978515625, + "logps/rejected": -1317.7305908203125, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9326269030570984, + "rewards/margins": 11.81169319152832, + "rewards/rejected": -12.7443208694458, + "step": 15540 + }, + { + "epoch": 0.93, + "learning_rate": 8.017577725812825e-08, + "logits/chosen": -2.6509127616882324, + "logits/rejected": -2.250549793243408, + "logps/chosen": -178.0433349609375, + "logps/rejected": -1138.2529296875, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.051513910293579, + "rewards/margins": 9.897701263427734, + "rewards/rejected": -10.949213027954102, + "step": 15550 + }, + { + "epoch": 0.93, + "learning_rate": 7.887373404568133e-08, + "logits/chosen": -2.650705575942993, + "logits/rejected": -2.1456823348999023, + "logps/chosen": -184.0078887939453, + "logps/rejected": -1261.8377685546875, + "loss": 0.0292, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.1063464879989624, + "rewards/margins": 11.06517505645752, + "rewards/rejected": -12.17152214050293, + "step": 15560 + }, + { + "epoch": 0.93, + "learning_rate": 7.758218060124916e-08, + "logits/chosen": -2.6347453594207764, + "logits/rejected": -2.2446770668029785, + "logps/chosen": -216.38247680664062, + "logps/rejected": -1281.050048828125, + "loss": 0.0514, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.415852427482605, + "rewards/margins": 10.954060554504395, + "rewards/rejected": -12.369911193847656, + "step": 15570 + }, + { + "epoch": 0.93, + "learning_rate": 7.630112252061534e-08, + "logits/chosen": -2.6698391437530518, + "logits/rejected": -2.2567455768585205, + "logps/chosen": -181.057373046875, + "logps/rejected": -1319.342041015625, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.120937705039978, + "rewards/margins": 11.62074089050293, + "rewards/rejected": -12.741678237915039, + "step": 15580 + }, + { + "epoch": 0.93, + "learning_rate": 7.503056535408975e-08, + "logits/chosen": -2.6975629329681396, + "logits/rejected": -2.2926926612854004, + "logps/chosen": -180.2107696533203, + "logps/rejected": -1263.228515625, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0885566473007202, + "rewards/margins": 11.096248626708984, + "rewards/rejected": -12.184804916381836, + "step": 15590 + }, + { + "epoch": 0.93, + "learning_rate": 7.377051460648682e-08, + "logits/chosen": -2.6085667610168457, + "logits/rejected": -2.229710102081299, + "logps/chosen": -170.35891723632812, + "logps/rejected": -1316.247314453125, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.012973427772522, + "rewards/margins": 11.701685905456543, + "rewards/rejected": -12.714659690856934, + "step": 15600 + }, + { + "epoch": 0.93, + "learning_rate": 7.252097573709982e-08, + "logits/chosen": -2.707313060760498, + "logits/rejected": -2.2668449878692627, + "logps/chosen": -165.257080078125, + "logps/rejected": -1292.935546875, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9548360705375671, + "rewards/margins": 11.541874885559082, + "rewards/rejected": -12.496709823608398, + "step": 15610 + }, + { + "epoch": 0.93, + "learning_rate": 7.128195415967987e-08, + "logits/chosen": -2.6687378883361816, + "logits/rejected": -2.3147215843200684, + "logps/chosen": -178.81834411621094, + "logps/rejected": -1303.671142578125, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.096724510192871, + "rewards/margins": 11.488851547241211, + "rewards/rejected": -12.585575103759766, + "step": 15620 + }, + { + "epoch": 0.93, + "learning_rate": 7.005345524240926e-08, + "logits/chosen": -2.6833956241607666, + "logits/rejected": -2.1427695751190186, + "logps/chosen": -178.79551696777344, + "logps/rejected": -1276.1961669921875, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0346635580062866, + "rewards/margins": 11.280159950256348, + "rewards/rejected": -12.314825057983398, + "step": 15630 + }, + { + "epoch": 0.93, + "learning_rate": 6.883548430788062e-08, + "logits/chosen": -2.6569366455078125, + "logits/rejected": -2.2688584327697754, + "logps/chosen": -173.58758544921875, + "logps/rejected": -1221.869384765625, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0539261102676392, + "rewards/margins": 10.717350959777832, + "rewards/rejected": -11.77127742767334, + "step": 15640 + }, + { + "epoch": 0.93, + "learning_rate": 6.762804663307365e-08, + "logits/chosen": -2.6584842205047607, + "logits/rejected": -2.195120334625244, + "logps/chosen": -184.82872009277344, + "logps/rejected": -1309.2528076171875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0888469219207764, + "rewards/margins": 11.556825637817383, + "rewards/rejected": -12.645673751831055, + "step": 15650 + }, + { + "epoch": 0.93, + "learning_rate": 6.643114744933038e-08, + "logits/chosen": -2.6389527320861816, + "logits/rejected": -2.224365711212158, + "logps/chosen": -171.68350219726562, + "logps/rejected": -1278.6298828125, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0224170684814453, + "rewards/margins": 11.32701301574707, + "rewards/rejected": -12.3494291305542, + "step": 15660 + }, + { + "epoch": 0.93, + "learning_rate": 6.524479194233463e-08, + "logits/chosen": -2.7615578174591064, + "logits/rejected": -2.2572968006134033, + "logps/chosen": -185.48269653320312, + "logps/rejected": -1336.146240234375, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1167644262313843, + "rewards/margins": 11.79128360748291, + "rewards/rejected": -12.908047676086426, + "step": 15670 + }, + { + "epoch": 0.94, + "learning_rate": 6.406898525208843e-08, + "logits/chosen": -2.623728036880493, + "logits/rejected": -2.1538612842559814, + "logps/chosen": -166.7744598388672, + "logps/rejected": -1292.7698974609375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9642624855041504, + "rewards/margins": 11.528935432434082, + "rewards/rejected": -12.49319839477539, + "step": 15680 + }, + { + "epoch": 0.94, + "learning_rate": 6.290373247289012e-08, + "logits/chosen": -2.649784564971924, + "logits/rejected": -2.1965155601501465, + "logps/chosen": -180.64471435546875, + "logps/rejected": -1309.1988525390625, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1040668487548828, + "rewards/margins": 11.546464920043945, + "rewards/rejected": -12.650530815124512, + "step": 15690 + }, + { + "epoch": 0.94, + "learning_rate": 6.174903865331177e-08, + "logits/chosen": -2.701651096343994, + "logits/rejected": -2.231921672821045, + "logps/chosen": -171.8490447998047, + "logps/rejected": -1340.428466796875, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.992447018623352, + "rewards/margins": 11.97745418548584, + "rewards/rejected": -12.969900131225586, + "step": 15700 + }, + { + "epoch": 0.94, + "learning_rate": 6.060490879617853e-08, + "logits/chosen": -2.6515955924987793, + "logits/rejected": -2.2553226947784424, + "logps/chosen": -164.45840454101562, + "logps/rejected": -1203.866455078125, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9104216694831848, + "rewards/margins": 10.70018196105957, + "rewards/rejected": -11.610605239868164, + "step": 15710 + }, + { + "epoch": 0.94, + "learning_rate": 5.947134785854597e-08, + "logits/chosen": -2.642116069793701, + "logits/rejected": -2.2558083534240723, + "logps/chosen": -168.17141723632812, + "logps/rejected": -1221.61279296875, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9918482899665833, + "rewards/margins": 10.772356033325195, + "rewards/rejected": -11.764204025268555, + "step": 15720 + }, + { + "epoch": 0.94, + "learning_rate": 5.8348360751677435e-08, + "logits/chosen": -2.644047498703003, + "logits/rejected": -2.196828603744507, + "logps/chosen": -172.4115447998047, + "logps/rejected": -1205.8841552734375, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9714924097061157, + "rewards/margins": 10.647869110107422, + "rewards/rejected": -11.619361877441406, + "step": 15730 + }, + { + "epoch": 0.94, + "learning_rate": 5.7235952341026524e-08, + "logits/chosen": -2.675264596939087, + "logits/rejected": -2.2065622806549072, + "logps/chosen": -179.26968383789062, + "logps/rejected": -1173.582763671875, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0718376636505127, + "rewards/margins": 10.227112770080566, + "rewards/rejected": -11.298949241638184, + "step": 15740 + }, + { + "epoch": 0.94, + "learning_rate": 5.6134127446211275e-08, + "logits/chosen": -2.6677443981170654, + "logits/rejected": -2.267427444458008, + "logps/chosen": -163.35110473632812, + "logps/rejected": -1212.1041259765625, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9990060925483704, + "rewards/margins": 10.684392929077148, + "rewards/rejected": -11.68339729309082, + "step": 15750 + }, + { + "epoch": 0.94, + "learning_rate": 5.5042890840996676e-08, + "logits/chosen": -2.7018380165100098, + "logits/rejected": -2.2311275005340576, + "logps/chosen": -178.52374267578125, + "logps/rejected": -1192.3687744140625, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1306133270263672, + "rewards/margins": 10.351810455322266, + "rewards/rejected": -11.482422828674316, + "step": 15760 + }, + { + "epoch": 0.94, + "learning_rate": 5.3962247253273035e-08, + "logits/chosen": -2.673051118850708, + "logits/rejected": -2.1973841190338135, + "logps/chosen": -228.45956420898438, + "logps/rejected": -1167.745849609375, + "loss": 0.0206, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.564034342765808, + "rewards/margins": 9.680739402770996, + "rewards/rejected": -11.24477481842041, + "step": 15770 + }, + { + "epoch": 0.94, + "learning_rate": 5.2892201365035144e-08, + "logits/chosen": -2.6698529720306396, + "logits/rejected": -2.263176202774048, + "logps/chosen": -178.38954162597656, + "logps/rejected": -1230.742431640625, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1097861528396606, + "rewards/margins": 10.761785507202148, + "rewards/rejected": -11.871572494506836, + "step": 15780 + }, + { + "epoch": 0.94, + "learning_rate": 5.18327578123623e-08, + "logits/chosen": -2.6756222248077393, + "logits/rejected": -2.244558334350586, + "logps/chosen": -159.38980102539062, + "logps/rejected": -1191.995361328125, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8802758455276489, + "rewards/margins": 10.607393264770508, + "rewards/rejected": -11.487669944763184, + "step": 15790 + }, + { + "epoch": 0.94, + "learning_rate": 5.078392118539777e-08, + "logits/chosen": -2.6724026203155518, + "logits/rejected": -2.255080461502075, + "logps/chosen": -188.29446411132812, + "logps/rejected": -1205.693115234375, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1916935443878174, + "rewards/margins": 10.421567916870117, + "rewards/rejected": -11.613263130187988, + "step": 15800 + }, + { + "epoch": 0.94, + "learning_rate": 4.974569602832991e-08, + "logits/chosen": -2.6580288410186768, + "logits/rejected": -2.2031607627868652, + "logps/chosen": -172.54359436035156, + "logps/rejected": -1260.949951171875, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.063873052597046, + "rewards/margins": 11.100789070129395, + "rewards/rejected": -12.16466236114502, + "step": 15810 + }, + { + "epoch": 0.94, + "learning_rate": 4.8718086839370794e-08, + "logits/chosen": -2.673914670944214, + "logits/rejected": -2.2175183296203613, + "logps/chosen": -175.0134735107422, + "logps/rejected": -1311.15771484375, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0596094131469727, + "rewards/margins": 11.605969429016113, + "rewards/rejected": -12.66557788848877, + "step": 15820 + }, + { + "epoch": 0.94, + "learning_rate": 4.7701098070739304e-08, + "logits/chosen": -2.6465909481048584, + "logits/rejected": -2.2616775035858154, + "logps/chosen": -174.7118377685547, + "logps/rejected": -1199.425537109375, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.057864785194397, + "rewards/margins": 10.501089096069336, + "rewards/rejected": -11.558954238891602, + "step": 15830 + }, + { + "epoch": 0.94, + "learning_rate": 4.66947341286389e-08, + "logits/chosen": -2.6740498542785645, + "logits/rejected": -2.178858518600464, + "logps/chosen": -163.44229125976562, + "logps/rejected": -1386.80908203125, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9184477925300598, + "rewards/margins": 12.485325813293457, + "rewards/rejected": -13.403773307800293, + "step": 15840 + }, + { + "epoch": 0.95, + "learning_rate": 4.5698999373240404e-08, + "logits/chosen": -2.6467292308807373, + "logits/rejected": -2.2230963706970215, + "logps/chosen": -184.9004364013672, + "logps/rejected": -1163.9185791015625, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1377449035644531, + "rewards/margins": 10.049426078796387, + "rewards/rejected": -11.18717098236084, + "step": 15850 + }, + { + "epoch": 0.95, + "learning_rate": 4.471389811866289e-08, + "logits/chosen": -2.6821835041046143, + "logits/rejected": -2.218871593475342, + "logps/chosen": -180.2025146484375, + "logps/rejected": -1147.4190673828125, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.142691969871521, + "rewards/margins": 9.896963119506836, + "rewards/rejected": -11.039654731750488, + "step": 15860 + }, + { + "epoch": 0.95, + "learning_rate": 4.373943463295477e-08, + "logits/chosen": -2.6431102752685547, + "logits/rejected": -2.2247185707092285, + "logps/chosen": -161.41119384765625, + "logps/rejected": -1239.714111328125, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9377598762512207, + "rewards/margins": 11.023923873901367, + "rewards/rejected": -11.961685180664062, + "step": 15870 + }, + { + "epoch": 0.95, + "learning_rate": 4.277561313807493e-08, + "logits/chosen": -2.6661829948425293, + "logits/rejected": -2.195704221725464, + "logps/chosen": -176.55194091796875, + "logps/rejected": -1253.5799560546875, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0512021780014038, + "rewards/margins": 11.044206619262695, + "rewards/rejected": -12.095407485961914, + "step": 15880 + }, + { + "epoch": 0.95, + "learning_rate": 4.1822437809874994e-08, + "logits/chosen": -2.657374143600464, + "logits/rejected": -2.2294418811798096, + "logps/chosen": -179.5014190673828, + "logps/rejected": -1220.29931640625, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1006819009780884, + "rewards/margins": 10.661455154418945, + "rewards/rejected": -11.762137413024902, + "step": 15890 + }, + { + "epoch": 0.95, + "learning_rate": 4.0879912778080956e-08, + "logits/chosen": -2.6456456184387207, + "logits/rejected": -2.2596583366394043, + "logps/chosen": -181.09066772460938, + "logps/rejected": -1146.300537109375, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1637568473815918, + "rewards/margins": 9.87397575378418, + "rewards/rejected": -11.037734031677246, + "step": 15900 + }, + { + "epoch": 0.95, + "learning_rate": 3.994804212627462e-08, + "logits/chosen": -2.651865005493164, + "logits/rejected": -2.2323803901672363, + "logps/chosen": -175.99172973632812, + "logps/rejected": -1316.3236083984375, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0354217290878296, + "rewards/margins": 11.680215835571289, + "rewards/rejected": -12.71563720703125, + "step": 15910 + }, + { + "epoch": 0.95, + "learning_rate": 3.902682989187889e-08, + "logits/chosen": -2.6795008182525635, + "logits/rejected": -2.213805675506592, + "logps/chosen": -160.47288513183594, + "logps/rejected": -1261.33642578125, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8807178735733032, + "rewards/margins": 11.292486190795898, + "rewards/rejected": -12.17320442199707, + "step": 15920 + }, + { + "epoch": 0.95, + "learning_rate": 3.8116280066134994e-08, + "logits/chosen": -2.6878819465637207, + "logits/rejected": -2.2324771881103516, + "logps/chosen": -162.35952758789062, + "logps/rejected": -1327.2325439453125, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9276419878005981, + "rewards/margins": 11.898499488830566, + "rewards/rejected": -12.826141357421875, + "step": 15930 + }, + { + "epoch": 0.95, + "learning_rate": 3.721639659409054e-08, + "logits/chosen": -2.6434741020202637, + "logits/rejected": -2.2681822776794434, + "logps/chosen": -172.68704223632812, + "logps/rejected": -1200.7523193359375, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0266098976135254, + "rewards/margins": 10.549015998840332, + "rewards/rejected": -11.5756254196167, + "step": 15940 + }, + { + "epoch": 0.95, + "learning_rate": 3.63271833745793e-08, + "logits/chosen": -2.6975810527801514, + "logits/rejected": -2.238269329071045, + "logps/chosen": -161.9984588623047, + "logps/rejected": -1281.224853515625, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8777400255203247, + "rewards/margins": 11.489442825317383, + "rewards/rejected": -12.367182731628418, + "step": 15950 + }, + { + "epoch": 0.95, + "learning_rate": 3.544864426020478e-08, + "logits/chosen": -2.676342487335205, + "logits/rejected": -2.294611692428589, + "logps/chosen": -176.2101287841797, + "logps/rejected": -1235.3106689453125, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0305867195129395, + "rewards/margins": 10.878564834594727, + "rewards/rejected": -11.909151077270508, + "step": 15960 + }, + { + "epoch": 0.95, + "learning_rate": 3.4580783057324706e-08, + "logits/chosen": -2.6921143531799316, + "logits/rejected": -2.222726821899414, + "logps/chosen": -186.8076171875, + "logps/rejected": -1137.28564453125, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.130562663078308, + "rewards/margins": 9.798645973205566, + "rewards/rejected": -10.929207801818848, + "step": 15970 + }, + { + "epoch": 0.95, + "learning_rate": 3.3723603526032435e-08, + "logits/chosen": -2.639613151550293, + "logits/rejected": -2.164780616760254, + "logps/chosen": -178.20245361328125, + "logps/rejected": -1183.5435791015625, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0918643474578857, + "rewards/margins": 10.308358192443848, + "rewards/rejected": -11.400223731994629, + "step": 15980 + }, + { + "epoch": 0.95, + "learning_rate": 3.2877109380143604e-08, + "logits/chosen": -2.694697380065918, + "logits/rejected": -2.214928150177002, + "logps/chosen": -181.5031280517578, + "logps/rejected": -1273.8643798828125, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1214652061462402, + "rewards/margins": 11.179290771484375, + "rewards/rejected": -12.300756454467773, + "step": 15990 + }, + { + "epoch": 0.95, + "learning_rate": 3.204130428717672e-08, + "logits/chosen": -2.70440411567688, + "logits/rejected": -2.1867878437042236, + "logps/chosen": -180.6644744873047, + "logps/rejected": -1208.783203125, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0519477128982544, + "rewards/margins": 10.596542358398438, + "rewards/rejected": -11.648488998413086, + "step": 16000 + }, + { + "epoch": 0.95, + "learning_rate": 3.121619186834041e-08, + "logits/chosen": -2.6858105659484863, + "logits/rejected": -2.246583938598633, + "logps/chosen": -205.57125854492188, + "logps/rejected": -1266.3936767578125, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3817111253738403, + "rewards/margins": 10.831425666809082, + "rewards/rejected": -12.213135719299316, + "step": 16010 + }, + { + "epoch": 0.96, + "learning_rate": 3.040177569851477e-08, + "logits/chosen": -2.61903715133667, + "logits/rejected": -2.208021879196167, + "logps/chosen": -174.45413208007812, + "logps/rejected": -1214.1480712890625, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0142533779144287, + "rewards/margins": 10.69257926940918, + "rewards/rejected": -11.706830978393555, + "step": 16020 + }, + { + "epoch": 0.96, + "learning_rate": 2.9598059306238658e-08, + "logits/chosen": -2.647799015045166, + "logits/rejected": -2.1788220405578613, + "logps/chosen": -170.56747436523438, + "logps/rejected": -1287.8985595703125, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9761293530464172, + "rewards/margins": 11.46005916595459, + "rewards/rejected": -12.436187744140625, + "step": 16030 + }, + { + "epoch": 0.96, + "learning_rate": 2.8805046173692176e-08, + "logits/chosen": -2.6659646034240723, + "logits/rejected": -2.2203304767608643, + "logps/chosen": -167.7789764404297, + "logps/rejected": -1295.4447021484375, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9377069473266602, + "rewards/margins": 11.559220314025879, + "rewards/rejected": -12.496925354003906, + "step": 16040 + }, + { + "epoch": 0.96, + "learning_rate": 2.802273973668279e-08, + "logits/chosen": -2.655034303665161, + "logits/rejected": -2.268833875656128, + "logps/chosen": -166.41989135742188, + "logps/rejected": -1366.1336669921875, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9681248664855957, + "rewards/margins": 12.258781433105469, + "rewards/rejected": -13.226905822753906, + "step": 16050 + }, + { + "epoch": 0.96, + "learning_rate": 2.725114338463064e-08, + "logits/chosen": -2.7416205406188965, + "logits/rejected": -2.2912113666534424, + "logps/chosen": -166.6707000732422, + "logps/rejected": -1312.750732421875, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9457403421401978, + "rewards/margins": 11.732978820800781, + "rewards/rejected": -12.678718566894531, + "step": 16060 + }, + { + "epoch": 0.96, + "learning_rate": 2.6490260460552143e-08, + "logits/chosen": -2.6333775520324707, + "logits/rejected": -2.215641736984253, + "logps/chosen": -173.98220825195312, + "logps/rejected": -1290.986328125, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9858376383781433, + "rewards/margins": 11.479873657226562, + "rewards/rejected": -12.46571159362793, + "step": 16070 + }, + { + "epoch": 0.96, + "learning_rate": 2.5740094261048342e-08, + "logits/chosen": -2.629335880279541, + "logits/rejected": -2.2283644676208496, + "logps/chosen": -198.64735412597656, + "logps/rejected": -1200.9288330078125, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3244969844818115, + "rewards/margins": 10.241098403930664, + "rewards/rejected": -11.565595626831055, + "step": 16080 + }, + { + "epoch": 0.96, + "learning_rate": 2.5000648036287712e-08, + "logits/chosen": -2.690540075302124, + "logits/rejected": -2.249478816986084, + "logps/chosen": -185.4785919189453, + "logps/rejected": -1211.8551025390625, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1727367639541626, + "rewards/margins": 10.489535331726074, + "rewards/rejected": -11.662272453308105, + "step": 16090 + }, + { + "epoch": 0.96, + "learning_rate": 2.4271924989993646e-08, + "logits/chosen": -2.6665353775024414, + "logits/rejected": -2.1682610511779785, + "logps/chosen": -161.706298828125, + "logps/rejected": -1247.8104248046875, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8923938870429993, + "rewards/margins": 11.139763832092285, + "rewards/rejected": -12.032156944274902, + "step": 16100 + }, + { + "epoch": 0.96, + "learning_rate": 2.3553928279431147e-08, + "logits/chosen": -2.7035655975341797, + "logits/rejected": -2.241546869277954, + "logps/chosen": -179.96670532226562, + "logps/rejected": -1372.9996337890625, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0802112817764282, + "rewards/margins": 12.1927490234375, + "rewards/rejected": -13.27295970916748, + "step": 16110 + }, + { + "epoch": 0.96, + "learning_rate": 2.284666101539129e-08, + "logits/chosen": -2.6543209552764893, + "logits/rejected": -2.226243495941162, + "logps/chosen": -157.00765991210938, + "logps/rejected": -1217.953857421875, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8477692604064941, + "rewards/margins": 10.890615463256836, + "rewards/rejected": -11.738385200500488, + "step": 16120 + }, + { + "epoch": 0.96, + "learning_rate": 2.2150126262179273e-08, + "logits/chosen": -2.679105758666992, + "logits/rejected": -2.259007215499878, + "logps/chosen": -170.59732055664062, + "logps/rejected": -1319.4207763671875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0483429431915283, + "rewards/margins": 11.701032638549805, + "rewards/rejected": -12.74937629699707, + "step": 16130 + }, + { + "epoch": 0.96, + "learning_rate": 2.1464327037600264e-08, + "logits/chosen": -2.667034149169922, + "logits/rejected": -2.145745038986206, + "logps/chosen": -173.6420440673828, + "logps/rejected": -1251.659912109375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.974643349647522, + "rewards/margins": 11.09910774230957, + "rewards/rejected": -12.073748588562012, + "step": 16140 + }, + { + "epoch": 0.96, + "learning_rate": 2.0789266312947477e-08, + "logits/chosen": -2.6917691230773926, + "logits/rejected": -2.3492634296417236, + "logps/chosen": -182.48651123046875, + "logps/rejected": -1119.5035400390625, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1133863925933838, + "rewards/margins": 9.64062786102295, + "rewards/rejected": -10.75401496887207, + "step": 16150 + }, + { + "epoch": 0.96, + "learning_rate": 2.0124947012987172e-08, + "logits/chosen": -2.644659996032715, + "logits/rejected": -2.214852809906006, + "logps/chosen": -152.89157104492188, + "logps/rejected": -1245.90966796875, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8423296213150024, + "rewards/margins": 11.1737060546875, + "rewards/rejected": -12.016035079956055, + "step": 16160 + }, + { + "epoch": 0.96, + "learning_rate": 1.947137201594923e-08, + "logits/chosen": -2.6840062141418457, + "logits/rejected": -2.3002514839172363, + "logps/chosen": -184.42825317382812, + "logps/rejected": -1231.1353759765625, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.095862627029419, + "rewards/margins": 10.771964073181152, + "rewards/rejected": -11.867825508117676, + "step": 16170 + }, + { + "epoch": 0.96, + "learning_rate": 1.8828544153510765e-08, + "logits/chosen": -2.705512285232544, + "logits/rejected": -2.3017754554748535, + "logps/chosen": -188.9728240966797, + "logps/rejected": -1201.132568359375, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2116485834121704, + "rewards/margins": 10.357165336608887, + "rewards/rejected": -11.568815231323242, + "step": 16180 + }, + { + "epoch": 0.97, + "learning_rate": 1.8196466210787245e-08, + "logits/chosen": -2.707188129425049, + "logits/rejected": -2.274749755859375, + "logps/chosen": -163.2353057861328, + "logps/rejected": -1249.7904052734375, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9587360620498657, + "rewards/margins": 11.098140716552734, + "rewards/rejected": -12.056876182556152, + "step": 16190 + }, + { + "epoch": 0.97, + "learning_rate": 1.7575140926318346e-08, + "logits/chosen": -2.6321167945861816, + "logits/rejected": -2.2132019996643066, + "logps/chosen": -193.10452270507812, + "logps/rejected": -1155.3184814453125, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2553688287734985, + "rewards/margins": 9.859628677368164, + "rewards/rejected": -11.114996910095215, + "step": 16200 + }, + { + "epoch": 0.97, + "learning_rate": 1.6964570992057394e-08, + "logits/chosen": -2.6683545112609863, + "logits/rejected": -2.2522847652435303, + "logps/chosen": -173.28628540039062, + "logps/rejected": -1343.628173828125, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.075852632522583, + "rewards/margins": 11.91985034942627, + "rewards/rejected": -12.995704650878906, + "step": 16210 + }, + { + "epoch": 0.97, + "learning_rate": 1.6364759053358603e-08, + "logits/chosen": -2.6427385807037354, + "logits/rejected": -2.2167510986328125, + "logps/chosen": -161.57571411132812, + "logps/rejected": -1264.093505859375, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9353412389755249, + "rewards/margins": 11.257986068725586, + "rewards/rejected": -12.193326950073242, + "step": 16220 + }, + { + "epoch": 0.97, + "learning_rate": 1.5775707708966247e-08, + "logits/chosen": -2.650177478790283, + "logits/rejected": -2.1728808879852295, + "logps/chosen": -172.66888427734375, + "logps/rejected": -1184.2003173828125, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0020825862884521, + "rewards/margins": 10.396360397338867, + "rewards/rejected": -11.398443222045898, + "step": 16230 + }, + { + "epoch": 0.97, + "learning_rate": 1.5197419511003564e-08, + "logits/chosen": -2.6753103733062744, + "logits/rejected": -2.2551887035369873, + "logps/chosen": -212.41061401367188, + "logps/rejected": -1316.4437255859375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3922874927520752, + "rewards/margins": 11.326181411743164, + "rewards/rejected": -12.718469619750977, + "step": 16240 + }, + { + "epoch": 0.97, + "learning_rate": 1.4629896964960533e-08, + "logits/chosen": -2.603825807571411, + "logits/rejected": -2.1976101398468018, + "logps/chosen": -161.50404357910156, + "logps/rejected": -1155.7236328125, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9237446784973145, + "rewards/margins": 10.20910930633545, + "rewards/rejected": -11.132853507995605, + "step": 16250 + }, + { + "epoch": 0.97, + "learning_rate": 1.4073142529685003e-08, + "logits/chosen": -2.6120095252990723, + "logits/rejected": -2.1848855018615723, + "logps/chosen": -182.35549926757812, + "logps/rejected": -1240.063232421875, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.036584734916687, + "rewards/margins": 10.91883659362793, + "rewards/rejected": -11.955423355102539, + "step": 16260 + }, + { + "epoch": 0.97, + "learning_rate": 1.3527158617370196e-08, + "logits/chosen": -2.6554369926452637, + "logits/rejected": -2.2158074378967285, + "logps/chosen": -169.08436584472656, + "logps/rejected": -1240.2252197265625, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0364867448806763, + "rewards/margins": 10.917577743530273, + "rewards/rejected": -11.954065322875977, + "step": 16270 + }, + { + "epoch": 0.97, + "learning_rate": 1.2991947593545273e-08, + "logits/chosen": -2.6749796867370605, + "logits/rejected": -2.2231011390686035, + "logps/chosen": -182.36309814453125, + "logps/rejected": -1285.9312744140625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0808775424957275, + "rewards/margins": 11.342862129211426, + "rewards/rejected": -12.42374038696289, + "step": 16280 + }, + { + "epoch": 0.97, + "learning_rate": 1.2467511777064789e-08, + "logits/chosen": -2.651242971420288, + "logits/rejected": -2.2104105949401855, + "logps/chosen": -177.27374267578125, + "logps/rejected": -1284.770263671875, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9978905916213989, + "rewards/margins": 11.40503978729248, + "rewards/rejected": -12.402929306030273, + "step": 16290 + }, + { + "epoch": 0.97, + "learning_rate": 1.1953853440098418e-08, + "logits/chosen": -2.6822965145111084, + "logits/rejected": -2.2353248596191406, + "logps/chosen": -158.13011169433594, + "logps/rejected": -1288.239501953125, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.840979278087616, + "rewards/margins": 11.592016220092773, + "rewards/rejected": -12.43299388885498, + "step": 16300 + }, + { + "epoch": 0.97, + "learning_rate": 1.145097480812124e-08, + "logits/chosen": -2.6494178771972656, + "logits/rejected": -2.2277884483337402, + "logps/chosen": -202.35888671875, + "logps/rejected": -1159.5748291015625, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3133056163787842, + "rewards/margins": 9.840926170349121, + "rewards/rejected": -11.154231071472168, + "step": 16310 + }, + { + "epoch": 0.97, + "learning_rate": 1.0958878059905143e-08, + "logits/chosen": -2.70688533782959, + "logits/rejected": -2.195460081100464, + "logps/chosen": -164.93997192382812, + "logps/rejected": -1241.07080078125, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9415566325187683, + "rewards/margins": 11.026819229125977, + "rewards/rejected": -11.968377113342285, + "step": 16320 + }, + { + "epoch": 0.97, + "learning_rate": 1.0477565327507155e-08, + "logits/chosen": -2.756645679473877, + "logits/rejected": -2.282986879348755, + "logps/chosen": -170.934814453125, + "logps/rejected": -1251.6358642578125, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0068751573562622, + "rewards/margins": 11.066400527954102, + "rewards/rejected": -12.073274612426758, + "step": 16330 + }, + { + "epoch": 0.97, + "learning_rate": 1.0007038696262517e-08, + "logits/chosen": -2.7095096111297607, + "logits/rejected": -2.1451876163482666, + "logps/chosen": -162.7404327392578, + "logps/rejected": -1209.6416015625, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9350031614303589, + "rewards/margins": 10.727781295776367, + "rewards/rejected": -11.662785530090332, + "step": 16340 + }, + { + "epoch": 0.97, + "learning_rate": 9.547300204773845e-09, + "logits/chosen": -2.647977352142334, + "logits/rejected": -2.137451171875, + "logps/chosen": -186.81507873535156, + "logps/rejected": -1232.0718994140625, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.140344262123108, + "rewards/margins": 10.737279891967773, + "rewards/rejected": -11.877625465393066, + "step": 16350 + }, + { + "epoch": 0.98, + "learning_rate": 9.098351844903653e-09, + "logits/chosen": -2.633021593093872, + "logits/rejected": -2.2412657737731934, + "logps/chosen": -191.61239624023438, + "logps/rejected": -1286.171875, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1795886754989624, + "rewards/margins": 11.230447769165039, + "rewards/rejected": -12.410036087036133, + "step": 16360 + }, + { + "epoch": 0.98, + "learning_rate": 8.660195561764617e-09, + "logits/chosen": -2.6692349910736084, + "logits/rejected": -2.1601152420043945, + "logps/chosen": -182.53131103515625, + "logps/rejected": -1202.576416015625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1304045915603638, + "rewards/margins": 10.447057723999023, + "rewards/rejected": -11.577461242675781, + "step": 16370 + }, + { + "epoch": 0.98, + "learning_rate": 8.232833253712657e-09, + "logits/chosen": -2.642995834350586, + "logits/rejected": -2.2354342937469482, + "logps/chosen": -164.24864196777344, + "logps/rejected": -1302.755859375, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9687468409538269, + "rewards/margins": 11.626684188842773, + "rewards/rejected": -12.595431327819824, + "step": 16380 + }, + { + "epoch": 0.98, + "learning_rate": 7.816266772336378e-09, + "logits/chosen": -2.655147075653076, + "logits/rejected": -2.253140687942505, + "logps/chosen": -182.57879638671875, + "logps/rejected": -1224.9423828125, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1447583436965942, + "rewards/margins": 10.65778923034668, + "rewards/rejected": -11.802547454833984, + "step": 16390 + }, + { + "epoch": 0.98, + "learning_rate": 7.410497922451243e-09, + "logits/chosen": -2.670452833175659, + "logits/rejected": -2.2061877250671387, + "logps/chosen": -169.3420867919922, + "logps/rejected": -1353.225341796875, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9867624044418335, + "rewards/margins": 12.098861694335938, + "rewards/rejected": -13.085622787475586, + "step": 16400 + }, + { + "epoch": 0.98, + "learning_rate": 7.015528462091248e-09, + "logits/chosen": -2.639734983444214, + "logits/rejected": -2.2123537063598633, + "logps/chosen": -184.6167449951172, + "logps/rejected": -1259.0152587890625, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.102648377418518, + "rewards/margins": 11.055513381958008, + "rewards/rejected": -12.158160209655762, + "step": 16410 + }, + { + "epoch": 0.98, + "learning_rate": 6.63136010250004e-09, + "logits/chosen": -2.6688857078552246, + "logits/rejected": -2.2339518070220947, + "logps/chosen": -173.97569274902344, + "logps/rejected": -1313.511962890625, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0491271018981934, + "rewards/margins": 11.649115562438965, + "rewards/rejected": -12.6982421875, + "step": 16420 + }, + { + "epoch": 0.98, + "learning_rate": 6.257994508124532e-09, + "logits/chosen": -2.647425413131714, + "logits/rejected": -2.2450172901153564, + "logps/chosen": -153.29281616210938, + "logps/rejected": -1286.2093505859375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.867265522480011, + "rewards/margins": 11.552124977111816, + "rewards/rejected": -12.419389724731445, + "step": 16430 + }, + { + "epoch": 0.98, + "learning_rate": 5.895433296608799e-09, + "logits/chosen": -2.6959025859832764, + "logits/rejected": -2.232936382293701, + "logps/chosen": -168.3072509765625, + "logps/rejected": -1244.86767578125, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9388321042060852, + "rewards/margins": 11.074982643127441, + "rewards/rejected": -12.013814926147461, + "step": 16440 + }, + { + "epoch": 0.98, + "learning_rate": 5.543678038784361e-09, + "logits/chosen": -2.6504616737365723, + "logits/rejected": -2.2044098377227783, + "logps/chosen": -176.8509979248047, + "logps/rejected": -1223.298583984375, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0913746356964111, + "rewards/margins": 10.703654289245605, + "rewards/rejected": -11.795029640197754, + "step": 16450 + }, + { + "epoch": 0.98, + "learning_rate": 5.202730258665745e-09, + "logits/chosen": -2.665109157562256, + "logits/rejected": -2.223554849624634, + "logps/chosen": -165.9916534423828, + "logps/rejected": -1278.921142578125, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9834840893745422, + "rewards/margins": 11.362183570861816, + "rewards/rejected": -12.345666885375977, + "step": 16460 + }, + { + "epoch": 0.98, + "learning_rate": 4.872591433442708e-09, + "logits/chosen": -2.6509833335876465, + "logits/rejected": -2.2273030281066895, + "logps/chosen": -160.29881286621094, + "logps/rejected": -1179.502685546875, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8968982696533203, + "rewards/margins": 10.453070640563965, + "rewards/rejected": -11.349969863891602, + "step": 16470 + }, + { + "epoch": 0.98, + "learning_rate": 4.5532629934744166e-09, + "logits/chosen": -2.679215908050537, + "logits/rejected": -2.254709005355835, + "logps/chosen": -176.91891479492188, + "logps/rejected": -1366.59716796875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0106145143508911, + "rewards/margins": 12.200166702270508, + "rewards/rejected": -13.210782051086426, + "step": 16480 + }, + { + "epoch": 0.98, + "learning_rate": 4.244746322282501e-09, + "logits/chosen": -2.6910905838012695, + "logits/rejected": -2.267216444015503, + "logps/chosen": -178.09024047851562, + "logps/rejected": -1130.768798828125, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1032586097717285, + "rewards/margins": 9.755863189697266, + "rewards/rejected": -10.859121322631836, + "step": 16490 + }, + { + "epoch": 0.98, + "learning_rate": 3.94704275654606e-09, + "logits/chosen": -2.6660170555114746, + "logits/rejected": -2.231707811355591, + "logps/chosen": -167.55545043945312, + "logps/rejected": -1288.8323974609375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9682508707046509, + "rewards/margins": 11.475881576538086, + "rewards/rejected": -12.444130897521973, + "step": 16500 + }, + { + "epoch": 0.98, + "learning_rate": 3.6601535860950053e-09, + "logits/chosen": -2.6902756690979004, + "logits/rejected": -2.18860125541687, + "logps/chosen": -172.37490844726562, + "logps/rejected": -1385.1578369140625, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9987273216247559, + "rewards/margins": 12.404215812683105, + "rewards/rejected": -13.402941703796387, + "step": 16510 + }, + { + "epoch": 0.99, + "learning_rate": 3.3840800539047815e-09, + "logits/chosen": -2.6335432529449463, + "logits/rejected": -2.2267251014709473, + "logps/chosen": -174.3216552734375, + "logps/rejected": -1261.455322265625, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0557180643081665, + "rewards/margins": 11.113521575927734, + "rewards/rejected": -12.16923999786377, + "step": 16520 + }, + { + "epoch": 0.99, + "learning_rate": 3.1188233560913717e-09, + "logits/chosen": -2.6585869789123535, + "logits/rejected": -2.2412328720092773, + "logps/chosen": -193.1375274658203, + "logps/rejected": -1241.5718994140625, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2179319858551025, + "rewards/margins": 10.76350212097168, + "rewards/rejected": -11.981435775756836, + "step": 16530 + }, + { + "epoch": 0.99, + "learning_rate": 2.8643846419057484e-09, + "logits/chosen": -2.672546625137329, + "logits/rejected": -2.2495222091674805, + "logps/chosen": -192.04794311523438, + "logps/rejected": -1263.3785400390625, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1814525127410889, + "rewards/margins": 11.017091751098633, + "rewards/rejected": -12.1985445022583, + "step": 16540 + }, + { + "epoch": 0.99, + "learning_rate": 2.6207650137283215e-09, + "logits/chosen": -2.708150625228882, + "logits/rejected": -2.2877864837646484, + "logps/chosen": -173.510009765625, + "logps/rejected": -1366.581787109375, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0227696895599365, + "rewards/margins": 12.202122688293457, + "rewards/rejected": -13.224891662597656, + "step": 16550 + }, + { + "epoch": 0.99, + "learning_rate": 2.3879655270650504e-09, + "logits/chosen": -2.7082462310791016, + "logits/rejected": -2.171170473098755, + "logps/chosen": -196.70620727539062, + "logps/rejected": -1233.5294189453125, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2639291286468506, + "rewards/margins": 10.629531860351562, + "rewards/rejected": -11.893461227416992, + "step": 16560 + }, + { + "epoch": 0.99, + "learning_rate": 2.1659871905430064e-09, + "logits/chosen": -2.6711885929107666, + "logits/rejected": -2.2594029903411865, + "logps/chosen": -180.36941528320312, + "logps/rejected": -1367.834228515625, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.047650694847107, + "rewards/margins": 12.184184074401855, + "rewards/rejected": -13.231834411621094, + "step": 16570 + }, + { + "epoch": 0.99, + "learning_rate": 1.954830965905097e-09, + "logits/chosen": -2.6764516830444336, + "logits/rejected": -2.23189115524292, + "logps/chosen": -172.1129913330078, + "logps/rejected": -1113.990478515625, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.040505051612854, + "rewards/margins": 9.66499137878418, + "rewards/rejected": -10.705495834350586, + "step": 16580 + }, + { + "epoch": 0.99, + "learning_rate": 1.7544977680064578e-09, + "logits/chosen": -2.652247428894043, + "logits/rejected": -2.215662717819214, + "logps/chosen": -157.01705932617188, + "logps/rejected": -1146.182861328125, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9195022583007812, + "rewards/margins": 10.094548225402832, + "rewards/rejected": -11.014049530029297, + "step": 16590 + }, + { + "epoch": 0.99, + "learning_rate": 1.564988464810291e-09, + "logits/chosen": -2.6204469203948975, + "logits/rejected": -2.241234064102173, + "logps/chosen": -156.6399688720703, + "logps/rejected": -1244.577392578125, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8605591654777527, + "rewards/margins": 11.15047550201416, + "rewards/rejected": -12.011034965515137, + "step": 16600 + }, + { + "epoch": 0.99, + "learning_rate": 1.386303877384254e-09, + "logits/chosen": -2.6458616256713867, + "logits/rejected": -2.1999783515930176, + "logps/chosen": -176.6503448486328, + "logps/rejected": -1230.3421630859375, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0806583166122437, + "rewards/margins": 10.774874687194824, + "rewards/rejected": -11.855533599853516, + "step": 16610 + }, + { + "epoch": 0.99, + "learning_rate": 1.2184447798971322e-09, + "logits/chosen": -2.688946485519409, + "logits/rejected": -2.167438507080078, + "logps/chosen": -171.19871520996094, + "logps/rejected": -1295.857421875, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0013701915740967, + "rewards/margins": 11.509811401367188, + "rewards/rejected": -12.511181831359863, + "step": 16620 + }, + { + "epoch": 0.99, + "learning_rate": 1.0614118996146727e-09, + "logits/chosen": -2.695730686187744, + "logits/rejected": -2.302783489227295, + "logps/chosen": -187.93051147460938, + "logps/rejected": -1276.522216796875, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1826187372207642, + "rewards/margins": 11.138666152954102, + "rewards/rejected": -12.321285247802734, + "step": 16630 + }, + { + "epoch": 0.99, + "learning_rate": 9.152059168976435e-10, + "logits/chosen": -2.654323101043701, + "logits/rejected": -2.1863253116607666, + "logps/chosen": -174.3800811767578, + "logps/rejected": -1250.937744140625, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0174219608306885, + "rewards/margins": 11.043591499328613, + "rewards/rejected": -12.061013221740723, + "step": 16640 + }, + { + "epoch": 0.99, + "learning_rate": 7.798274651979465e-10, + "logits/chosen": -2.637986660003662, + "logits/rejected": -2.2469239234924316, + "logps/chosen": -196.75131225585938, + "logps/rejected": -1252.58251953125, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2284270524978638, + "rewards/margins": 10.838926315307617, + "rewards/rejected": -12.067353248596191, + "step": 16650 + }, + { + "epoch": 0.99, + "learning_rate": 6.552771310558426e-10, + "logits/chosen": -2.707401990890503, + "logits/rejected": -2.188354253768921, + "logps/chosen": -187.86233520507812, + "logps/rejected": -1123.2301025390625, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1808136701583862, + "rewards/margins": 9.608713150024414, + "rewards/rejected": -10.789527893066406, + "step": 16660 + }, + { + "epoch": 0.99, + "learning_rate": 5.415554540977308e-10, + "logits/chosen": -2.6379354000091553, + "logits/rejected": -2.2245168685913086, + "logps/chosen": -176.5787353515625, + "logps/rejected": -1283.8917236328125, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0409266948699951, + "rewards/margins": 11.354669570922852, + "rewards/rejected": -12.395596504211426, + "step": 16670 + }, + { + "epoch": 0.99, + "learning_rate": 4.386629270342058e-10, + "logits/chosen": -2.6777150630950928, + "logits/rejected": -2.2911205291748047, + "logps/chosen": -179.92527770996094, + "logps/rejected": -1208.0726318359375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1010576486587524, + "rewards/margins": 10.538191795349121, + "rewards/rejected": -11.639249801635742, + "step": 16680 + }, + { + "epoch": 1.0, + "learning_rate": 3.465999956575594e-10, + "logits/chosen": -2.6711153984069824, + "logits/rejected": -2.2284107208251953, + "logps/chosen": -178.85572814941406, + "logps/rejected": -1175.2252197265625, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0292279720306396, + "rewards/margins": 10.2882080078125, + "rewards/rejected": -11.317437171936035, + "step": 16690 + }, + { + "epoch": 1.0, + "learning_rate": 2.653670588390056e-10, + "logits/chosen": -2.6496317386627197, + "logits/rejected": -2.1748948097229004, + "logps/chosen": -176.67767333984375, + "logps/rejected": -1217.37353515625, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0714900493621826, + "rewards/margins": 10.662920951843262, + "rewards/rejected": -11.734411239624023, + "step": 16700 + }, + { + "epoch": 1.0, + "learning_rate": 1.9496446852840244e-10, + "logits/chosen": -2.668013095855713, + "logits/rejected": -2.3151652812957764, + "logps/chosen": -176.02630615234375, + "logps/rejected": -1224.7296142578125, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0651683807373047, + "rewards/margins": 10.749885559082031, + "rewards/rejected": -11.815053939819336, + "step": 16710 + }, + { + "epoch": 1.0, + "learning_rate": 1.3539252975175442e-10, + "logits/chosen": -2.6090002059936523, + "logits/rejected": -2.220052719116211, + "logps/chosen": -194.02365112304688, + "logps/rejected": -1199.383056640625, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2622654438018799, + "rewards/margins": 10.295656204223633, + "rewards/rejected": -11.55792236328125, + "step": 16720 + }, + { + "epoch": 1.0, + "learning_rate": 8.665150061093475e-11, + "logits/chosen": -2.6500911712646484, + "logits/rejected": -2.043184280395508, + "logps/chosen": -198.4783172607422, + "logps/rejected": -1317.258056640625, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2058560848236084, + "rewards/margins": 11.52690315246582, + "rewards/rejected": -12.732759475708008, + "step": 16730 + }, + { + "epoch": 1.0, + "learning_rate": 4.874159228063224e-11, + "logits/chosen": -2.6816859245300293, + "logits/rejected": -2.2883806228637695, + "logps/chosen": -156.26614379882812, + "logps/rejected": -1260.8797607421875, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.824610710144043, + "rewards/margins": 11.33895206451416, + "rewards/rejected": -12.163562774658203, + "step": 16740 + }, + { + "epoch": 1.0, + "learning_rate": 2.1662969009461632e-11, + "logits/chosen": -2.6500821113586426, + "logits/rejected": -2.225158452987671, + "logps/chosen": -164.89157104492188, + "logps/rejected": -1216.4520263671875, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9150495529174805, + "rewards/margins": 10.813158988952637, + "rewards/rejected": -11.728208541870117, + "step": 16750 + }, + { + "epoch": 1.0, + "learning_rate": 5.415748118575703e-12, + "logits/chosen": -2.6592376232147217, + "logits/rejected": -2.247807264328003, + "logps/chosen": -177.20169067382812, + "logps/rejected": -1228.7794189453125, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.057569146156311, + "rewards/margins": 10.785604476928711, + "rewards/rejected": -11.843174934387207, + "step": 16760 + }, + { + "epoch": 1.0, + "learning_rate": 0.0, + "logits/chosen": -2.650529623031616, + "logits/rejected": -2.262345552444458, + "logps/chosen": -170.49472045898438, + "logps/rejected": -1145.919189453125, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9720252156257629, + "rewards/margins": 10.04547119140625, + "rewards/rejected": -11.017497062683105, + "step": 16770 + }, + { + "epoch": 1.0, + "step": 16770, + "total_flos": 0.0, + "train_loss": 0.03956493256323719, + "train_runtime": 68990.3604, + "train_samples_per_second": 1.945, + "train_steps_per_second": 0.243 + } + ], + "logging_steps": 10, + "max_steps": 16770, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}