{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998037291462217, "eval_steps": 500, "global_step": 2547, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.9607843137254902e-08, "logits/chosen": 0.46488896012306213, "logits/rejected": 0.6080908179283142, "logps/chosen": -248.03536987304688, "logps/rejected": -250.7172393798828, "loss": 0.0925, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 1.9607843137254904e-07, "logits/chosen": 0.5732576847076416, "logits/rejected": 0.5053917765617371, "logps/chosen": -283.8868103027344, "logps/rejected": -267.1470947265625, "loss": 0.0623, "rewards/accuracies": 0.42592594027519226, "rewards/chosen": 6.855401807115413e-06, "rewards/margins": -2.866806426027324e-06, "rewards/rejected": 9.722218237584457e-06, "step": 10 }, { "epoch": 0.01, "learning_rate": 3.921568627450981e-07, "logits/chosen": 0.6395555138587952, "logits/rejected": 0.5183161497116089, "logps/chosen": -259.010498046875, "logps/rejected": -230.4682159423828, "loss": 0.0524, "rewards/accuracies": 0.49166664481163025, "rewards/chosen": 4.217164314468391e-05, "rewards/margins": 4.323801113059744e-05, "rewards/rejected": -1.0663676448530168e-06, "step": 20 }, { "epoch": 0.01, "learning_rate": 5.882352941176471e-07, "logits/chosen": 0.6133766770362854, "logits/rejected": 0.5772495269775391, "logps/chosen": -268.6424255371094, "logps/rejected": -238.4060516357422, "loss": 0.0604, "rewards/accuracies": 0.429166704416275, "rewards/chosen": -2.178473187086638e-05, "rewards/margins": -3.491970710456371e-05, "rewards/rejected": 1.3134970686223824e-05, "step": 30 }, { "epoch": 0.02, "learning_rate": 7.843137254901962e-07, "logits/chosen": 0.5451570749282837, "logits/rejected": 0.6063296794891357, "logps/chosen": -270.48089599609375, "logps/rejected": -240.4629669189453, "loss": 0.0518, "rewards/accuracies": 0.429166704416275, "rewards/chosen": -5.210627568885684e-05, "rewards/margins": -1.3891922208131291e-05, "rewards/rejected": -3.821435529971495e-05, "step": 40 }, { "epoch": 0.02, "learning_rate": 9.80392156862745e-07, "logits/chosen": 0.5579402446746826, "logits/rejected": 0.6521514058113098, "logps/chosen": -308.3138732910156, "logps/rejected": -270.6910095214844, "loss": 0.0497, "rewards/accuracies": 0.49166664481163025, "rewards/chosen": -6.496746323136904e-07, "rewards/margins": 5.1773578888969496e-05, "rewards/rejected": -5.242325642029755e-05, "step": 50 }, { "epoch": 0.02, "learning_rate": 1.1764705882352942e-06, "logits/chosen": 0.5895996689796448, "logits/rejected": 0.5530626177787781, "logps/chosen": -271.9648742675781, "logps/rejected": -238.8866729736328, "loss": 0.0549, "rewards/accuracies": 0.4791666567325592, "rewards/chosen": 4.7397597882081755e-06, "rewards/margins": 8.998825069284067e-05, "rewards/rejected": -8.5248495452106e-05, "step": 60 }, { "epoch": 0.03, "learning_rate": 1.3725490196078434e-06, "logits/chosen": 0.5712030529975891, "logits/rejected": 0.5446642637252808, "logps/chosen": -269.52349853515625, "logps/rejected": -244.25918579101562, "loss": 0.0597, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": -0.00014829839346930385, "rewards/margins": 3.822638973360881e-05, "rewards/rejected": -0.00018652477592695504, "step": 70 }, { "epoch": 0.03, "learning_rate": 1.5686274509803923e-06, "logits/chosen": 0.5794577598571777, "logits/rejected": 0.6224395632743835, "logps/chosen": -262.1968688964844, "logps/rejected": -252.41336059570312, "loss": 0.0772, "rewards/accuracies": 0.4833333492279053, "rewards/chosen": -0.00011436897329986095, "rewards/margins": 8.153673115884885e-05, "rewards/rejected": -0.00019590572628658265, "step": 80 }, { "epoch": 0.04, "learning_rate": 1.7647058823529414e-06, "logits/chosen": 0.607342541217804, "logits/rejected": 0.6603871583938599, "logps/chosen": -282.23468017578125, "logps/rejected": -247.54745483398438, "loss": 0.0475, "rewards/accuracies": 0.5083333253860474, "rewards/chosen": -7.813036063453183e-05, "rewards/margins": 0.00024410069454461336, "rewards/rejected": -0.0003222310624551028, "step": 90 }, { "epoch": 0.04, "learning_rate": 1.96078431372549e-06, "logits/chosen": 0.5872822999954224, "logits/rejected": 0.512497067451477, "logps/chosen": -297.87933349609375, "logps/rejected": -265.9036560058594, "loss": 0.0574, "rewards/accuracies": 0.491666704416275, "rewards/chosen": -0.0002379983925493434, "rewards/margins": 0.0001731793163344264, "rewards/rejected": -0.0004111776943318546, "step": 100 }, { "epoch": 0.04, "learning_rate": 2.1568627450980393e-06, "logits/chosen": 0.5177062749862671, "logits/rejected": 0.5935253500938416, "logps/chosen": -284.13128662109375, "logps/rejected": -252.48641967773438, "loss": 0.0541, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": -0.0005169311771169305, "rewards/margins": 0.00025343807647004724, "rewards/rejected": -0.0007703691953793168, "step": 110 }, { "epoch": 0.05, "learning_rate": 2.3529411764705885e-06, "logits/chosen": 0.5826085209846497, "logits/rejected": 0.5734565258026123, "logps/chosen": -279.1020202636719, "logps/rejected": -242.3232879638672, "loss": 0.0615, "rewards/accuracies": 0.5208333730697632, "rewards/chosen": -0.0006812514620833099, "rewards/margins": 0.0002726772800087929, "rewards/rejected": -0.0009539287420921028, "step": 120 }, { "epoch": 0.05, "learning_rate": 2.549019607843137e-06, "logits/chosen": 0.5514948964118958, "logits/rejected": 0.6129211783409119, "logps/chosen": -297.36810302734375, "logps/rejected": -250.8700408935547, "loss": 0.0656, "rewards/accuracies": 0.5458333492279053, "rewards/chosen": -0.0008248643134720623, "rewards/margins": 0.0005141490837559104, "rewards/rejected": -0.0013390134554356337, "step": 130 }, { "epoch": 0.05, "learning_rate": 2.7450980392156867e-06, "logits/chosen": 0.5837856531143188, "logits/rejected": 0.5866830348968506, "logps/chosen": -278.97528076171875, "logps/rejected": -263.4961242675781, "loss": 0.0432, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": -0.0009940828895196319, "rewards/margins": 0.000614291406236589, "rewards/rejected": -0.001608374179340899, "step": 140 }, { "epoch": 0.06, "learning_rate": 2.9411764705882355e-06, "logits/chosen": 0.5553902983665466, "logits/rejected": 0.6875311732292175, "logps/chosen": -302.75372314453125, "logps/rejected": -287.02630615234375, "loss": 0.0439, "rewards/accuracies": 0.491666704416275, "rewards/chosen": -0.0014637492131441832, "rewards/margins": 0.00041431220597587526, "rewards/rejected": -0.001878061331808567, "step": 150 }, { "epoch": 0.06, "learning_rate": 3.1372549019607846e-06, "logits/chosen": 0.5743287205696106, "logits/rejected": 0.6214945912361145, "logps/chosen": -273.1415710449219, "logps/rejected": -248.86343383789062, "loss": 0.0478, "rewards/accuracies": 0.5708333253860474, "rewards/chosen": -0.0014291107654571533, "rewards/margins": 0.0008312534773722291, "rewards/rejected": -0.0022603641264140606, "step": 160 }, { "epoch": 0.07, "learning_rate": 3.3333333333333333e-06, "logits/chosen": 0.4786551892757416, "logits/rejected": 0.5745611190795898, "logps/chosen": -275.05743408203125, "logps/rejected": -235.23709106445312, "loss": 0.0487, "rewards/accuracies": 0.5625, "rewards/chosen": -0.001848508371040225, "rewards/margins": 0.0009307243162766099, "rewards/rejected": -0.0027792328037321568, "step": 170 }, { "epoch": 0.07, "learning_rate": 3.529411764705883e-06, "logits/chosen": 0.4652465879917145, "logits/rejected": 0.5753687620162964, "logps/chosen": -267.81134033203125, "logps/rejected": -247.77261352539062, "loss": 0.0632, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": -0.0021076188422739506, "rewards/margins": 0.0012388969771564007, "rewards/rejected": -0.0033465158194303513, "step": 180 }, { "epoch": 0.07, "learning_rate": 3.7254901960784316e-06, "logits/chosen": 0.4822580814361572, "logits/rejected": 0.5309303998947144, "logps/chosen": -272.3936767578125, "logps/rejected": -253.17068481445312, "loss": 0.0688, "rewards/accuracies": 0.5708333849906921, "rewards/chosen": -0.0028272040653973818, "rewards/margins": 0.0016173187177628279, "rewards/rejected": -0.004444523248821497, "step": 190 }, { "epoch": 0.08, "learning_rate": 3.92156862745098e-06, "logits/chosen": 0.5600818395614624, "logits/rejected": 0.5540789365768433, "logps/chosen": -288.64727783203125, "logps/rejected": -252.9949951171875, "loss": 0.0574, "rewards/accuracies": 0.5708333253860474, "rewards/chosen": -0.00358955143019557, "rewards/margins": 0.001761708059348166, "rewards/rejected": -0.005351259373128414, "step": 200 }, { "epoch": 0.08, "learning_rate": 4.11764705882353e-06, "logits/chosen": 0.4279872477054596, "logits/rejected": 0.542730450630188, "logps/chosen": -290.51312255859375, "logps/rejected": -259.2461853027344, "loss": 0.0538, "rewards/accuracies": 0.6041667461395264, "rewards/chosen": -0.004497007466852665, "rewards/margins": 0.0022050284314900637, "rewards/rejected": -0.006702035665512085, "step": 210 }, { "epoch": 0.09, "learning_rate": 4.313725490196079e-06, "logits/chosen": 0.4644894599914551, "logits/rejected": 0.5338133573532104, "logps/chosen": -289.86248779296875, "logps/rejected": -260.25262451171875, "loss": 0.0521, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -0.005599470343440771, "rewards/margins": 0.0019665162544697523, "rewards/rejected": -0.007565985433757305, "step": 220 }, { "epoch": 0.09, "learning_rate": 4.509803921568628e-06, "logits/chosen": 0.4923954904079437, "logits/rejected": 0.5235131978988647, "logps/chosen": -298.415771484375, "logps/rejected": -281.25933837890625, "loss": 0.055, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.005643138196319342, "rewards/margins": 0.002189520513638854, "rewards/rejected": -0.007832659408450127, "step": 230 }, { "epoch": 0.09, "learning_rate": 4.705882352941177e-06, "logits/chosen": 0.4870142936706543, "logits/rejected": 0.4811806082725525, "logps/chosen": -316.99267578125, "logps/rejected": -287.41534423828125, "loss": 0.0486, "rewards/accuracies": 0.6166666746139526, "rewards/chosen": -0.0062836273573338985, "rewards/margins": 0.0031557553447782993, "rewards/rejected": -0.009439383633434772, "step": 240 }, { "epoch": 0.1, "learning_rate": 4.901960784313726e-06, "logits/chosen": 0.5249590873718262, "logits/rejected": 0.516154408454895, "logps/chosen": -310.1789855957031, "logps/rejected": -269.47296142578125, "loss": 0.0598, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.00644350191578269, "rewards/margins": 0.00401445385068655, "rewards/rejected": -0.010457955300807953, "step": 250 }, { "epoch": 0.1, "learning_rate": 4.999941289086112e-06, "logits/chosen": 0.42474403977394104, "logits/rejected": 0.5383282899856567, "logps/chosen": -313.0384521484375, "logps/rejected": -276.7310485839844, "loss": 0.053, "rewards/accuracies": 0.6416667103767395, "rewards/chosen": -0.006402502302080393, "rewards/margins": 0.005376026965677738, "rewards/rejected": -0.011778528802096844, "step": 260 }, { "epoch": 0.11, "learning_rate": 4.999471618320339e-06, "logits/chosen": 0.4861672818660736, "logits/rejected": 0.4541027545928955, "logps/chosen": -308.2907409667969, "logps/rejected": -281.0884704589844, "loss": 0.0544, "rewards/accuracies": 0.6458333730697632, "rewards/chosen": -0.009800055995583534, "rewards/margins": 0.005087652709335089, "rewards/rejected": -0.014887707307934761, "step": 270 }, { "epoch": 0.11, "learning_rate": 4.998532365027117e-06, "logits/chosen": 0.47271719574928284, "logits/rejected": 0.5171914100646973, "logps/chosen": -307.691650390625, "logps/rejected": -277.66546630859375, "loss": 0.0474, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.010870738886296749, "rewards/margins": 0.004916337318718433, "rewards/rejected": -0.015787076205015182, "step": 280 }, { "epoch": 0.11, "learning_rate": 4.997123705666514e-06, "logits/chosen": 0.37636083364486694, "logits/rejected": 0.45900458097457886, "logps/chosen": -302.5892639160156, "logps/rejected": -288.510498046875, "loss": 0.053, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.013394233770668507, "rewards/margins": 0.005068282596766949, "rewards/rejected": -0.018462518230080605, "step": 290 }, { "epoch": 0.12, "learning_rate": 4.995245904887195e-06, "logits/chosen": 0.3868557810783386, "logits/rejected": 0.5042775273323059, "logps/chosen": -296.7281799316406, "logps/rejected": -283.5146484375, "loss": 0.0397, "rewards/accuracies": 0.5583333373069763, "rewards/chosen": -0.018061377108097076, "rewards/margins": 0.004579090513288975, "rewards/rejected": -0.022640468552708626, "step": 300 }, { "epoch": 0.12, "learning_rate": 4.992899315476696e-06, "logits/chosen": 0.45571646094322205, "logits/rejected": 0.552790641784668, "logps/chosen": -299.4598388671875, "logps/rejected": -281.2596740722656, "loss": 0.0719, "rewards/accuracies": 0.5958333611488342, "rewards/chosen": -0.015838582068681717, "rewards/margins": 0.007758588995784521, "rewards/rejected": -0.023597171530127525, "step": 310 }, { "epoch": 0.13, "learning_rate": 4.990084378295148e-06, "logits/chosen": 0.3956819474697113, "logits/rejected": 0.4794086515903473, "logps/chosen": -290.8548278808594, "logps/rejected": -266.06280517578125, "loss": 0.0533, "rewards/accuracies": 0.5875000357627869, "rewards/chosen": -0.017532404512166977, "rewards/margins": 0.008752369321882725, "rewards/rejected": -0.026284774765372276, "step": 320 }, { "epoch": 0.13, "learning_rate": 4.986801622192453e-06, "logits/chosen": 0.3517320156097412, "logits/rejected": 0.40463319420814514, "logps/chosen": -299.07025146484375, "logps/rejected": -276.71099853515625, "loss": 0.0548, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": -0.021012626588344574, "rewards/margins": 0.00989647675305605, "rewards/rejected": -0.03090910241007805, "step": 330 }, { "epoch": 0.13, "learning_rate": 4.9830516639089226e-06, "logits/chosen": 0.33298957347869873, "logits/rejected": 0.4724348485469818, "logps/chosen": -321.5062561035156, "logps/rejected": -284.62286376953125, "loss": 0.0538, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02476133033633232, "rewards/margins": 0.010583627037703991, "rewards/rejected": -0.03534495830535889, "step": 340 }, { "epoch": 0.14, "learning_rate": 4.978835207959414e-06, "logits/chosen": 0.3737282454967499, "logits/rejected": 0.34736761450767517, "logps/chosen": -336.1554260253906, "logps/rejected": -318.351318359375, "loss": 0.0401, "rewards/accuracies": 0.5916666984558105, "rewards/chosen": -0.03460691124200821, "rewards/margins": 0.013388733379542828, "rewards/rejected": -0.047995638102293015, "step": 350 }, { "epoch": 0.14, "learning_rate": 4.9741530465009665e-06, "logits/chosen": 0.19469328224658966, "logits/rejected": 0.26699501276016235, "logps/chosen": -305.50714111328125, "logps/rejected": -293.6611022949219, "loss": 0.0498, "rewards/accuracies": 0.6041666269302368, "rewards/chosen": -0.03571179881691933, "rewards/margins": 0.020327303558588028, "rewards/rejected": -0.05603910610079765, "step": 360 }, { "epoch": 0.15, "learning_rate": 4.969006059183984e-06, "logits/chosen": 0.22031648457050323, "logits/rejected": 0.17535282671451569, "logps/chosen": -339.7189025878906, "logps/rejected": -320.98211669921875, "loss": 0.0606, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.04812643676996231, "rewards/margins": 0.02219363860785961, "rewards/rejected": -0.07032007724046707, "step": 370 }, { "epoch": 0.15, "learning_rate": 4.963395212986964e-06, "logits/chosen": 0.17484304308891296, "logits/rejected": 0.343574583530426, "logps/chosen": -330.0755310058594, "logps/rejected": -316.24786376953125, "loss": 0.0577, "rewards/accuracies": 0.6208333969116211, "rewards/chosen": -0.04701613634824753, "rewards/margins": 0.023009616881608963, "rewards/rejected": -0.07002574950456619, "step": 380 }, { "epoch": 0.15, "learning_rate": 4.957321562034833e-06, "logits/chosen": 0.20497021079063416, "logits/rejected": 0.14503000676631927, "logps/chosen": -324.77587890625, "logps/rejected": -312.4165954589844, "loss": 0.0476, "rewards/accuracies": 0.5791667103767395, "rewards/chosen": -0.056038498878479004, "rewards/margins": 0.020049121230840683, "rewards/rejected": -0.07608762383460999, "step": 390 }, { "epoch": 0.16, "learning_rate": 4.950786247400908e-06, "logits/chosen": 0.14520034193992615, "logits/rejected": 0.17736613750457764, "logps/chosen": -342.2245178222656, "logps/rejected": -356.9132995605469, "loss": 0.0519, "rewards/accuracies": 0.5708333849906921, "rewards/chosen": -0.07601647078990936, "rewards/margins": 0.01825210079550743, "rewards/rejected": -0.09426857531070709, "step": 400 }, { "epoch": 0.16, "learning_rate": 4.943790496892513e-06, "logits/chosen": 0.05470971390604973, "logits/rejected": 0.11131185293197632, "logps/chosen": -342.9344482421875, "logps/rejected": -324.174560546875, "loss": 0.0469, "rewards/accuracies": 0.5916666388511658, "rewards/chosen": -0.07838026434183121, "rewards/margins": 0.02048862725496292, "rewards/rejected": -0.09886889159679413, "step": 410 }, { "epoch": 0.16, "learning_rate": 4.936335624820313e-06, "logits/chosen": 0.003329972270876169, "logits/rejected": 0.11333286762237549, "logps/chosen": -394.786865234375, "logps/rejected": -365.4358215332031, "loss": 0.0382, "rewards/accuracies": 0.5916667580604553, "rewards/chosen": -0.0723833367228508, "rewards/margins": 0.019510764628648758, "rewards/rejected": -0.09189411252737045, "step": 420 }, { "epoch": 0.17, "learning_rate": 4.9284230317513906e-06, "logits/chosen": 0.0464923158288002, "logits/rejected": 0.10706806182861328, "logps/chosen": -389.9523620605469, "logps/rejected": -353.79107666015625, "loss": 0.0344, "rewards/accuracies": 0.6083333492279053, "rewards/chosen": -0.08335821330547333, "rewards/margins": 0.021867142990231514, "rewards/rejected": -0.10522536188364029, "step": 430 }, { "epoch": 0.17, "learning_rate": 4.920054204246116e-06, "logits/chosen": 0.033209413290023804, "logits/rejected": 0.10846443474292755, "logps/chosen": -370.37774658203125, "logps/rejected": -356.85919189453125, "loss": 0.0486, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": -0.08408842980861664, "rewards/margins": 0.023422975093126297, "rewards/rejected": -0.10751141607761383, "step": 440 }, { "epoch": 0.18, "learning_rate": 4.911230714578858e-06, "logits/chosen": 0.06528304517269135, "logits/rejected": 0.03408993408083916, "logps/chosen": -376.26800537109375, "logps/rejected": -403.13470458984375, "loss": 0.0451, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10300055891275406, "rewards/margins": 0.028705894947052002, "rewards/rejected": -0.13170644640922546, "step": 450 }, { "epoch": 0.18, "learning_rate": 4.90195422044261e-06, "logits/chosen": 0.009843332692980766, "logits/rejected": 0.08538283407688141, "logps/chosen": -380.8959045410156, "logps/rejected": -362.66143798828125, "loss": 0.0432, "rewards/accuracies": 0.595833420753479, "rewards/chosen": -0.08409236371517181, "rewards/margins": 0.024981295689940453, "rewards/rejected": -0.10907366126775742, "step": 460 }, { "epoch": 0.18, "learning_rate": 4.89222646463754e-06, "logits/chosen": -0.025348594412207603, "logits/rejected": 0.07642585784196854, "logps/chosen": -367.01043701171875, "logps/rejected": -378.41986083984375, "loss": 0.0375, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.09914330393075943, "rewards/margins": 0.0271566454321146, "rewards/rejected": -0.12629994750022888, "step": 470 }, { "epoch": 0.19, "learning_rate": 4.882049274743578e-06, "logits/chosen": 0.08742884546518326, "logits/rejected": -0.006139741744846106, "logps/chosen": -395.2368469238281, "logps/rejected": -397.25640869140625, "loss": 0.0418, "rewards/accuracies": 0.5916666984558105, "rewards/chosen": -0.09513958543539047, "rewards/margins": 0.032758679240942, "rewards/rejected": -0.12789827585220337, "step": 480 }, { "epoch": 0.19, "learning_rate": 4.8714245627770515e-06, "logits/chosen": 0.0017001063097268343, "logits/rejected": 0.0077015915885567665, "logps/chosen": -409.57733154296875, "logps/rejected": -395.6904296875, "loss": 0.0474, "rewards/accuracies": 0.6208332777023315, "rewards/chosen": -0.11873996257781982, "rewards/margins": 0.029652219265699387, "rewards/rejected": -0.1483922004699707, "step": 490 }, { "epoch": 0.2, "learning_rate": 4.860354324831482e-06, "logits/chosen": -0.04007488489151001, "logits/rejected": -0.007833145558834076, "logps/chosen": -427.4029235839844, "logps/rejected": -418.568359375, "loss": 0.0494, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.13312438130378723, "rewards/margins": 0.033408962190151215, "rewards/rejected": -0.16653335094451904, "step": 500 }, { "epoch": 0.2, "learning_rate": 4.848840640702565e-06, "logits/chosen": -0.07128571718931198, "logits/rejected": 0.001640728092752397, "logps/chosen": -411.7811584472656, "logps/rejected": -417.4090270996094, "loss": 0.0453, "rewards/accuracies": 0.6291667222976685, "rewards/chosen": -0.14686502516269684, "rewards/margins": 0.0317012295126915, "rewards/rejected": -0.17856626212596893, "step": 510 }, { "epoch": 0.2, "learning_rate": 4.836885673497435e-06, "logits/chosen": -0.002718993928283453, "logits/rejected": 0.0009809813927859068, "logps/chosen": -444.13201904296875, "logps/rejected": -433.43511962890625, "loss": 0.048, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1496659815311432, "rewards/margins": 0.03328583389520645, "rewards/rejected": -0.18295182287693024, "step": 520 }, { "epoch": 0.21, "learning_rate": 4.824491669228279e-06, "logits/chosen": 0.019337791949510574, "logits/rejected": 0.09313885867595673, "logps/chosen": -413.6084899902344, "logps/rejected": -422.8143005371094, "loss": 0.0336, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.15184900164604187, "rewards/margins": 0.027125831693410873, "rewards/rejected": -0.17897482216358185, "step": 530 }, { "epoch": 0.21, "learning_rate": 4.811660956390372e-06, "logits/chosen": 0.047072622925043106, "logits/rejected": -0.007110513746738434, "logps/chosen": -463.8948669433594, "logps/rejected": -487.029541015625, "loss": 0.0472, "rewards/accuracies": 0.6708333492279053, "rewards/chosen": -0.1728728711605072, "rewards/margins": 0.03820256516337395, "rewards/rejected": -0.21107542514801025, "step": 540 }, { "epoch": 0.22, "learning_rate": 4.798395945524615e-06, "logits/chosen": 0.038352273404598236, "logits/rejected": 0.03103743866086006, "logps/chosen": -426.7752380371094, "logps/rejected": -435.8916931152344, "loss": 0.0432, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.17635078728199005, "rewards/margins": 0.03284124284982681, "rewards/rejected": -0.20919200778007507, "step": 550 }, { "epoch": 0.22, "learning_rate": 4.784699128764654e-06, "logits/chosen": 0.03960705175995827, "logits/rejected": 0.08261826634407043, "logps/chosen": -458.94873046875, "logps/rejected": -475.46539306640625, "loss": 0.0379, "rewards/accuracies": 0.5791667103767395, "rewards/chosen": -0.18301726877689362, "rewards/margins": 0.03872082382440567, "rewards/rejected": -0.2217380702495575, "step": 560 }, { "epoch": 0.22, "learning_rate": 4.770573079368691e-06, "logits/chosen": 0.034317515790462494, "logits/rejected": 0.12880605459213257, "logps/chosen": -512.765625, "logps/rejected": -510.3446350097656, "loss": 0.0413, "rewards/accuracies": 0.6458333730697632, "rewards/chosen": -0.20771527290344238, "rewards/margins": 0.03588930517435074, "rewards/rejected": -0.24360458552837372, "step": 570 }, { "epoch": 0.23, "learning_rate": 4.756020451236025e-06, "logits/chosen": -0.017928753048181534, "logits/rejected": 0.12967847287654877, "logps/chosen": -548.6043090820312, "logps/rejected": -566.4415283203125, "loss": 0.0288, "rewards/accuracies": 0.6166666746139526, "rewards/chosen": -0.21577982604503632, "rewards/margins": 0.04475008323788643, "rewards/rejected": -0.26052993535995483, "step": 580 }, { "epoch": 0.23, "learning_rate": 4.741043978408463e-06, "logits/chosen": 0.0831088125705719, "logits/rejected": 0.1524641066789627, "logps/chosen": -465.2798767089844, "logps/rejected": -489.66558837890625, "loss": 0.0535, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.18829752504825592, "rewards/margins": 0.055931150913238525, "rewards/rejected": -0.24422867596149445, "step": 590 }, { "epoch": 0.24, "learning_rate": 4.725646474556666e-06, "logits/chosen": 0.13520978391170502, "logits/rejected": 0.03108205832540989, "logps/chosen": -504.00933837890625, "logps/rejected": -523.7846069335938, "loss": 0.0373, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.2136494219303131, "rewards/margins": 0.0509483627974987, "rewards/rejected": -0.26459774374961853, "step": 600 }, { "epoch": 0.24, "learning_rate": 4.709830832451538e-06, "logits/chosen": 0.08839813619852066, "logits/rejected": 0.1621231734752655, "logps/chosen": -543.0064086914062, "logps/rejected": -552.148681640625, "loss": 0.0367, "rewards/accuracies": 0.5750001072883606, "rewards/chosen": -0.21853478252887726, "rewards/margins": 0.02961021102964878, "rewards/rejected": -0.24814501404762268, "step": 610 }, { "epoch": 0.24, "learning_rate": 4.693600023420758e-06, "logits/chosen": 0.12001373618841171, "logits/rejected": 0.15668334066867828, "logps/chosen": -525.3939208984375, "logps/rejected": -518.4581298828125, "loss": 0.035, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.20465150475502014, "rewards/margins": 0.050372164696455, "rewards/rejected": -0.25502365827560425, "step": 620 }, { "epoch": 0.25, "learning_rate": 4.676957096790536e-06, "logits/chosen": 0.03348086029291153, "logits/rejected": 0.053055208176374435, "logps/chosen": -540.022216796875, "logps/rejected": -549.9537963867188, "loss": 0.0447, "rewards/accuracies": 0.6208333373069763, "rewards/chosen": -0.25113242864608765, "rewards/margins": 0.0484883114695549, "rewards/rejected": -0.29962077736854553, "step": 630 }, { "epoch": 0.25, "learning_rate": 4.659905179312743e-06, "logits/chosen": -0.022173499688506126, "logits/rejected": -0.0015158101450651884, "logps/chosen": -499.20263671875, "logps/rejected": -524.68115234375, "loss": 0.0365, "rewards/accuracies": 0.5958333611488342, "rewards/chosen": -0.20415392518043518, "rewards/margins": 0.0474516786634922, "rewards/rejected": -0.2516055703163147, "step": 640 }, { "epoch": 0.26, "learning_rate": 4.642447474577466e-06, "logits/chosen": 0.0385822094976902, "logits/rejected": 0.08059495687484741, "logps/chosen": -467.4093322753906, "logps/rejected": -448.34063720703125, "loss": 0.04, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.18831539154052734, "rewards/margins": 0.029135623946785927, "rewards/rejected": -0.21745100617408752, "step": 650 }, { "epoch": 0.26, "learning_rate": 4.6245872624111535e-06, "logits/chosen": 0.07350125908851624, "logits/rejected": 0.07948148250579834, "logps/chosen": -486.54443359375, "logps/rejected": -482.19207763671875, "loss": 0.0374, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": -0.21189257502555847, "rewards/margins": 0.02745920978486538, "rewards/rejected": -0.2393517792224884, "step": 660 }, { "epoch": 0.26, "learning_rate": 4.606327898260413e-06, "logits/chosen": 0.08668817579746246, "logits/rejected": 0.10542847216129303, "logps/chosen": -524.11181640625, "logps/rejected": -522.0614013671875, "loss": 0.0481, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -0.23068180680274963, "rewards/margins": 0.027634575963020325, "rewards/rejected": -0.25831639766693115, "step": 670 }, { "epoch": 0.27, "learning_rate": 4.587672812561626e-06, "logits/chosen": 0.11421390622854233, "logits/rejected": 0.12782898545265198, "logps/chosen": -481.06671142578125, "logps/rejected": -506.188232421875, "loss": 0.037, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2014632523059845, "rewards/margins": 0.04268348962068558, "rewards/rejected": -0.24414674937725067, "step": 680 }, { "epoch": 0.27, "learning_rate": 4.5686255100964535e-06, "logits/chosen": 0.18164952099323273, "logits/rejected": 0.17569103837013245, "logps/chosen": -491.39691162109375, "logps/rejected": -484.49127197265625, "loss": 0.0323, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -0.2082110345363617, "rewards/margins": 0.022325094789266586, "rewards/rejected": -0.23053613305091858, "step": 690 }, { "epoch": 0.27, "learning_rate": 4.549189569333387e-06, "logits/chosen": 0.1008072942495346, "logits/rejected": 0.10313601791858673, "logps/chosen": -464.47589111328125, "logps/rejected": -479.8761291503906, "loss": 0.0427, "rewards/accuracies": 0.6041666269302368, "rewards/chosen": -0.207403302192688, "rewards/margins": 0.03284333646297455, "rewards/rejected": -0.24024665355682373, "step": 700 }, { "epoch": 0.28, "learning_rate": 4.529368641755453e-06, "logits/chosen": 0.17117619514465332, "logits/rejected": 0.13185498118400574, "logps/chosen": -490.5037536621094, "logps/rejected": -495.96728515625, "loss": 0.037, "rewards/accuracies": 0.60833340883255, "rewards/chosen": -0.1993740200996399, "rewards/margins": 0.03559664636850357, "rewards/rejected": -0.23497064411640167, "step": 710 }, { "epoch": 0.28, "learning_rate": 4.509166451174194e-06, "logits/chosen": 0.0657685250043869, "logits/rejected": 0.2077389508485794, "logps/chosen": -550.5696411132812, "logps/rejected": -528.6010131835938, "loss": 0.038, "rewards/accuracies": 0.625, "rewards/chosen": -0.23079976439476013, "rewards/margins": 0.03827609866857529, "rewards/rejected": -0.269075870513916, "step": 720 }, { "epoch": 0.29, "learning_rate": 4.488586793030075e-06, "logits/chosen": 0.06361114978790283, "logits/rejected": 0.1869657188653946, "logps/chosen": -494.99249267578125, "logps/rejected": -481.3677673339844, "loss": 0.0422, "rewards/accuracies": 0.6041666269302368, "rewards/chosen": -0.2145577371120453, "rewards/margins": 0.040170665830373764, "rewards/rejected": -0.25472837686538696, "step": 730 }, { "epoch": 0.29, "learning_rate": 4.4676335336794125e-06, "logits/chosen": 0.05644664913415909, "logits/rejected": 0.24188189208507538, "logps/chosen": -548.88037109375, "logps/rejected": -532.0226440429688, "loss": 0.0443, "rewards/accuracies": 0.5583333373069763, "rewards/chosen": -0.25053781270980835, "rewards/margins": 0.0365094318985939, "rewards/rejected": -0.28704723715782166, "step": 740 }, { "epoch": 0.29, "learning_rate": 4.446310609668001e-06, "logits/chosen": 0.11400438845157623, "logits/rejected": 0.18557283282279968, "logps/chosen": -490.58465576171875, "logps/rejected": -531.9818725585938, "loss": 0.0383, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": -0.23414695262908936, "rewards/margins": 0.035373009741306305, "rewards/rejected": -0.2695199251174927, "step": 750 }, { "epoch": 0.3, "learning_rate": 4.424622026991536e-06, "logits/chosen": 0.11939887702465057, "logits/rejected": 0.1422724574804306, "logps/chosen": -542.72900390625, "logps/rejected": -549.841796875, "loss": 0.0374, "rewards/accuracies": 0.6208333969116211, "rewards/chosen": -0.24516990780830383, "rewards/margins": 0.0396624319255352, "rewards/rejected": -0.28483232855796814, "step": 760 }, { "epoch": 0.3, "learning_rate": 4.402571860343006e-06, "logits/chosen": 0.1107536181807518, "logits/rejected": 0.09486749023199081, "logps/chosen": -526.3046875, "logps/rejected": -545.7291870117188, "loss": 0.0343, "rewards/accuracies": 0.5791667103767395, "rewards/chosen": -0.23441223800182343, "rewards/margins": 0.04228886589407921, "rewards/rejected": -0.27670109272003174, "step": 770 }, { "epoch": 0.31, "learning_rate": 4.3801642523471585e-06, "logits/chosen": 0.11268027126789093, "logits/rejected": 0.14215223491191864, "logps/chosen": -493.4173889160156, "logps/rejected": -503.95245361328125, "loss": 0.038, "rewards/accuracies": 0.6250000596046448, "rewards/chosen": -0.21670031547546387, "rewards/margins": 0.042834293097257614, "rewards/rejected": -0.259534627199173, "step": 780 }, { "epoch": 0.31, "learning_rate": 4.35740341278222e-06, "logits/chosen": 0.1416054219007492, "logits/rejected": 0.10916705429553986, "logps/chosen": -567.3914794921875, "logps/rejected": -554.6038818359375, "loss": 0.0343, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.24395592510700226, "rewards/margins": 0.042997851967811584, "rewards/rejected": -0.28695374727249146, "step": 790 }, { "epoch": 0.31, "learning_rate": 4.334293617788992e-06, "logits/chosen": 0.05959437042474747, "logits/rejected": 0.09841112047433853, "logps/chosen": -537.2469482421875, "logps/rejected": -533.2855224609375, "loss": 0.0502, "rewards/accuracies": 0.5791667103767395, "rewards/chosen": -0.25295352935791016, "rewards/margins": 0.05385099723935127, "rewards/rejected": -0.30680450797080994, "step": 800 }, { "epoch": 0.32, "learning_rate": 4.310839209067482e-06, "logits/chosen": 0.16151997447013855, "logits/rejected": 0.02669985592365265, "logps/chosen": -547.8688354492188, "logps/rejected": -549.8050537109375, "loss": 0.0386, "rewards/accuracies": 0.625, "rewards/chosen": -0.2529495060443878, "rewards/margins": 0.04387221857905388, "rewards/rejected": -0.2968217432498932, "step": 810 }, { "epoch": 0.32, "learning_rate": 4.2870445930612135e-06, "logits/chosen": 0.09316407889127731, "logits/rejected": 0.13477447628974915, "logps/chosen": -542.4185180664062, "logps/rejected": -559.9661254882812, "loss": 0.0449, "rewards/accuracies": 0.6416666507720947, "rewards/chosen": -0.25684836506843567, "rewards/margins": 0.05672596022486687, "rewards/rejected": -0.31357431411743164, "step": 820 }, { "epoch": 0.33, "learning_rate": 4.262914240129379e-06, "logits/chosen": 0.12965205311775208, "logits/rejected": 0.1867331862449646, "logps/chosen": -539.5619506835938, "logps/rejected": -548.6708984375, "loss": 0.0431, "rewards/accuracies": 0.6208332777023315, "rewards/chosen": -0.23366475105285645, "rewards/margins": 0.05336092785000801, "rewards/rejected": -0.28702566027641296, "step": 830 }, { "epoch": 0.33, "learning_rate": 4.238452683706979e-06, "logits/chosen": 0.1362159699201584, "logits/rejected": 0.1769772171974182, "logps/chosen": -512.2302856445312, "logps/rejected": -537.4754028320312, "loss": 0.0399, "rewards/accuracies": 0.6624999642372131, "rewards/chosen": -0.24229979515075684, "rewards/margins": 0.051566917449235916, "rewards/rejected": -0.29386672377586365, "step": 840 }, { "epoch": 0.33, "learning_rate": 4.213664519453115e-06, "logits/chosen": 0.18831387162208557, "logits/rejected": 0.23091156780719757, "logps/chosen": -525.028564453125, "logps/rejected": -547.172607421875, "loss": 0.0379, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.23653945326805115, "rewards/margins": 0.0342828668653965, "rewards/rejected": -0.27082234621047974, "step": 850 }, { "epoch": 0.34, "learning_rate": 4.188554404387588e-06, "logits/chosen": 0.18981604278087616, "logits/rejected": 0.24236874282360077, "logps/chosen": -510.7457580566406, "logps/rejected": -513.2806396484375, "loss": 0.0438, "rewards/accuracies": 0.5541666746139526, "rewards/chosen": -0.23030021786689758, "rewards/margins": 0.021967504173517227, "rewards/rejected": -0.2522676885128021, "step": 860 }, { "epoch": 0.34, "learning_rate": 4.163127056015975e-06, "logits/chosen": 0.1602737456560135, "logits/rejected": 0.16516181826591492, "logps/chosen": -540.715576171875, "logps/rejected": -552.7288208007812, "loss": 0.0414, "rewards/accuracies": 0.6375001072883606, "rewards/chosen": -0.24130482971668243, "rewards/margins": 0.037548404186964035, "rewards/rejected": -0.27885323762893677, "step": 870 }, { "epoch": 0.35, "learning_rate": 4.137387251443335e-06, "logits/chosen": 0.13992497324943542, "logits/rejected": 0.2219216376543045, "logps/chosen": -486.76177978515625, "logps/rejected": -504.03802490234375, "loss": 0.0368, "rewards/accuracies": 0.6750000715255737, "rewards/chosen": -0.2241494357585907, "rewards/margins": 0.049367643892765045, "rewards/rejected": -0.27351707220077515, "step": 880 }, { "epoch": 0.35, "learning_rate": 4.111339826476725e-06, "logits/chosen": 0.11946271359920502, "logits/rejected": 0.25534653663635254, "logps/chosen": -515.7346801757812, "logps/rejected": -519.0991821289062, "loss": 0.0398, "rewards/accuracies": 0.5708333253860474, "rewards/chosen": -0.2389453649520874, "rewards/margins": 0.034484557807445526, "rewards/rejected": -0.2734299600124359, "step": 890 }, { "epoch": 0.35, "learning_rate": 4.084989674716679e-06, "logits/chosen": 0.21193349361419678, "logits/rejected": 0.1593044102191925, "logps/chosen": -566.4176025390625, "logps/rejected": -586.424072265625, "loss": 0.0372, "rewards/accuracies": 0.6208333969116211, "rewards/chosen": -0.2513963282108307, "rewards/margins": 0.043524421751499176, "rewards/rejected": -0.29492074251174927, "step": 900 }, { "epoch": 0.36, "learning_rate": 4.05834174663784e-06, "logits/chosen": 0.12491929531097412, "logits/rejected": 0.21425482630729675, "logps/chosen": -551.5991821289062, "logps/rejected": -550.2804565429688, "loss": 0.0412, "rewards/accuracies": 0.5958333015441895, "rewards/chosen": -0.259804904460907, "rewards/margins": 0.03690405935049057, "rewards/rejected": -0.29670897126197815, "step": 910 }, { "epoch": 0.36, "learning_rate": 4.031401048658892e-06, "logits/chosen": 0.13918252289295197, "logits/rejected": 0.1187935471534729, "logps/chosen": -546.859130859375, "logps/rejected": -551.1666870117188, "loss": 0.046, "rewards/accuracies": 0.6250000596046448, "rewards/chosen": -0.24925978481769562, "rewards/margins": 0.04101654142141342, "rewards/rejected": -0.29027634859085083, "step": 920 }, { "epoch": 0.37, "learning_rate": 4.004172642202002e-06, "logits/chosen": 0.1344153732061386, "logits/rejected": 0.193574458360672, "logps/chosen": -535.381103515625, "logps/rejected": -538.9880981445312, "loss": 0.0491, "rewards/accuracies": 0.60833340883255, "rewards/chosen": -0.2537057399749756, "rewards/margins": 0.041815683245658875, "rewards/rejected": -0.29552140831947327, "step": 930 }, { "epoch": 0.37, "learning_rate": 3.976661642741908e-06, "logits/chosen": 0.13408444821834564, "logits/rejected": 0.13395357131958008, "logps/chosen": -571.904052734375, "logps/rejected": -573.4716796875, "loss": 0.0366, "rewards/accuracies": 0.6458333730697632, "rewards/chosen": -0.27147650718688965, "rewards/margins": 0.036444105207920074, "rewards/rejected": -0.3079206347465515, "step": 940 }, { "epoch": 0.37, "learning_rate": 3.948873218844863e-06, "logits/chosen": 0.07277832925319672, "logits/rejected": 0.1558527946472168, "logps/chosen": -547.19580078125, "logps/rejected": -566.0330200195312, "loss": 0.0421, "rewards/accuracies": 0.5916666984558105, "rewards/chosen": -0.27663344144821167, "rewards/margins": 0.02979731187224388, "rewards/rejected": -0.30643078684806824, "step": 950 }, { "epoch": 0.38, "learning_rate": 3.920812591197604e-06, "logits/chosen": 0.17433296144008636, "logits/rejected": 0.22166447341442108, "logps/chosen": -513.27783203125, "logps/rejected": -523.3780517578125, "loss": 0.0456, "rewards/accuracies": 0.5708333253860474, "rewards/chosen": -0.25121426582336426, "rewards/margins": 0.043769314885139465, "rewards/rejected": -0.2949835956096649, "step": 960 }, { "epoch": 0.38, "learning_rate": 3.892485031626527e-06, "logits/chosen": 0.10151199996471405, "logits/rejected": 0.2275511473417282, "logps/chosen": -542.8049926757812, "logps/rejected": -553.17724609375, "loss": 0.0392, "rewards/accuracies": 0.6041666865348816, "rewards/chosen": -0.26481813192367554, "rewards/margins": 0.041800495237112045, "rewards/rejected": -0.3066186308860779, "step": 970 }, { "epoch": 0.38, "learning_rate": 3.863895862107255e-06, "logits/chosen": 0.0613768994808197, "logits/rejected": 0.17055214941501617, "logps/chosen": -565.1832885742188, "logps/rejected": -586.520751953125, "loss": 0.0371, "rewards/accuracies": 0.6083333492279053, "rewards/chosen": -0.26884034276008606, "rewards/margins": 0.048948947340250015, "rewards/rejected": -0.31778931617736816, "step": 980 }, { "epoch": 0.39, "learning_rate": 3.835050453764779e-06, "logits/chosen": 0.05239003151655197, "logits/rejected": 0.084544338285923, "logps/chosen": -555.6310424804688, "logps/rejected": -569.8839721679688, "loss": 0.0325, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.27361077070236206, "rewards/margins": 0.05519675463438034, "rewards/rejected": -0.3288075625896454, "step": 990 }, { "epoch": 0.39, "learning_rate": 3.80595422586438e-06, "logits/chosen": 0.04233277589082718, "logits/rejected": 0.19852963089942932, "logps/chosen": -601.9495239257812, "logps/rejected": -589.6444091796875, "loss": 0.0337, "rewards/accuracies": 0.6125000715255737, "rewards/chosen": -0.2874451279640198, "rewards/margins": 0.038885898888111115, "rewards/rejected": -0.3263310492038727, "step": 1000 }, { "epoch": 0.4, "learning_rate": 3.7766126447934857e-06, "logits/chosen": 0.055559318512678146, "logits/rejected": 0.08241190761327744, "logps/chosen": -535.3719482421875, "logps/rejected": -538.2369995117188, "loss": 0.0353, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": -0.27672910690307617, "rewards/margins": 0.02185177244246006, "rewards/rejected": -0.2985808849334717, "step": 1010 }, { "epoch": 0.4, "learning_rate": 3.7470312230346955e-06, "logits/chosen": 0.13304325938224792, "logits/rejected": 0.17154903709888458, "logps/chosen": -644.9939575195312, "logps/rejected": -635.8195190429688, "loss": 0.0361, "rewards/accuracies": 0.6291667222976685, "rewards/chosen": -0.31732743978500366, "rewards/margins": 0.04391757771372795, "rewards/rejected": -0.3612450361251831, "step": 1020 }, { "epoch": 0.4, "learning_rate": 3.717215518130127e-06, "logits/chosen": 0.05652935430407524, "logits/rejected": 0.14982140064239502, "logps/chosen": -623.8362426757812, "logps/rejected": -625.2267456054688, "loss": 0.041, "rewards/accuracies": 0.5458332896232605, "rewards/chosen": -0.3348296284675598, "rewards/margins": 0.03417374938726425, "rewards/rejected": -0.3690033555030823, "step": 1030 }, { "epoch": 0.41, "learning_rate": 3.687171131637314e-06, "logits/chosen": 0.1260356307029724, "logits/rejected": 0.22198334336280823, "logps/chosen": -650.363525390625, "logps/rejected": -636.438232421875, "loss": 0.0392, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3333287835121155, "rewards/margins": 0.04869426414370537, "rewards/rejected": -0.38202303647994995, "step": 1040 }, { "epoch": 0.41, "learning_rate": 3.6569037080768153e-06, "logits/chosen": 0.08464168012142181, "logits/rejected": 0.1217307597398758, "logps/chosen": -594.9779663085938, "logps/rejected": -604.1603393554688, "loss": 0.0422, "rewards/accuracies": 0.6416667103767395, "rewards/chosen": -0.3139379620552063, "rewards/margins": 0.04148873686790466, "rewards/rejected": -0.35542672872543335, "step": 1050 }, { "epoch": 0.42, "learning_rate": 3.6264189338717766e-06, "logits/chosen": 0.12213291972875595, "logits/rejected": 0.16966882348060608, "logps/chosen": -577.3529052734375, "logps/rejected": -577.2765502929688, "loss": 0.0335, "rewards/accuracies": 0.6125000715255737, "rewards/chosen": -0.2956945598125458, "rewards/margins": 0.03605617582798004, "rewards/rejected": -0.3317507207393646, "step": 1060 }, { "epoch": 0.42, "learning_rate": 3.595722536279595e-06, "logits/chosen": 0.11031585931777954, "logits/rejected": 0.15994997322559357, "logps/chosen": -584.4676513671875, "logps/rejected": -583.6405639648438, "loss": 0.0409, "rewards/accuracies": 0.6041666269302368, "rewards/chosen": -0.298969566822052, "rewards/margins": 0.04104112833738327, "rewards/rejected": -0.34001070261001587, "step": 1070 }, { "epoch": 0.42, "learning_rate": 3.5648202823159317e-06, "logits/chosen": 0.08247041702270508, "logits/rejected": 0.11911840736865997, "logps/chosen": -548.5514526367188, "logps/rejected": -581.8680419921875, "loss": 0.0379, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.27981576323509216, "rewards/margins": 0.04667786508798599, "rewards/rejected": -0.32649365067481995, "step": 1080 }, { "epoch": 0.43, "learning_rate": 3.5337179776712427e-06, "logits/chosen": 0.05764445662498474, "logits/rejected": 0.15337909758090973, "logps/chosen": -600.7415771484375, "logps/rejected": -617.4767456054688, "loss": 0.0489, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": -0.29948121309280396, "rewards/margins": 0.03527333587408066, "rewards/rejected": -0.3347545266151428, "step": 1090 }, { "epoch": 0.43, "learning_rate": 3.5024214656200497e-06, "logits/chosen": 0.07227407395839691, "logits/rejected": 0.06218891218304634, "logps/chosen": -563.1790771484375, "logps/rejected": -560.1734619140625, "loss": 0.0368, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": -0.2846832275390625, "rewards/margins": 0.041954539716243744, "rewards/rejected": -0.32663780450820923, "step": 1100 }, { "epoch": 0.44, "learning_rate": 3.4709366259231468e-06, "logits/chosen": 0.0697668045759201, "logits/rejected": 0.24307604134082794, "logps/chosen": -628.77001953125, "logps/rejected": -621.4201049804688, "loss": 0.0358, "rewards/accuracies": 0.6041667461395264, "rewards/chosen": -0.30719098448753357, "rewards/margins": 0.05207127332687378, "rewards/rejected": -0.3592623174190521, "step": 1110 }, { "epoch": 0.44, "learning_rate": 3.439269373722957e-06, "logits/chosen": 0.0506308451294899, "logits/rejected": 0.15233644843101501, "logps/chosen": -559.6640014648438, "logps/rejected": -583.3889770507812, "loss": 0.0399, "rewards/accuracies": 0.6250000596046448, "rewards/chosen": -0.2970438003540039, "rewards/margins": 0.048429377377033234, "rewards/rejected": -0.34547320008277893, "step": 1120 }, { "epoch": 0.44, "learning_rate": 3.4074256584322336e-06, "logits/chosen": 0.1312958151102066, "logits/rejected": 0.09558086097240448, "logps/chosen": -608.6600341796875, "logps/rejected": -633.423095703125, "loss": 0.0393, "rewards/accuracies": 0.6458333730697632, "rewards/chosen": -0.3313175439834595, "rewards/margins": 0.04614431411027908, "rewards/rejected": -0.3774617910385132, "step": 1130 }, { "epoch": 0.45, "learning_rate": 3.375411462616332e-06, "logits/chosen": 0.050155311822891235, "logits/rejected": 0.05635789781808853, "logps/chosen": -635.3673095703125, "logps/rejected": -645.3787231445312, "loss": 0.0328, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -0.3184329867362976, "rewards/margins": 0.049739837646484375, "rewards/rejected": -0.368172824382782, "step": 1140 }, { "epoch": 0.45, "learning_rate": 3.343232800869247e-06, "logits/chosen": 0.06382627040147781, "logits/rejected": 0.16157305240631104, "logps/chosen": -633.6954345703125, "logps/rejected": -649.1434326171875, "loss": 0.0361, "rewards/accuracies": 0.6916666626930237, "rewards/chosen": -0.32204508781433105, "rewards/margins": 0.053830452263355255, "rewards/rejected": -0.3758755326271057, "step": 1150 }, { "epoch": 0.46, "learning_rate": 3.310895718683635e-06, "logits/chosen": 0.14830520749092102, "logits/rejected": 0.155593141913414, "logps/chosen": -621.237548828125, "logps/rejected": -618.225341796875, "loss": 0.0465, "rewards/accuracies": 0.6166666746139526, "rewards/chosen": -0.30767199397087097, "rewards/margins": 0.04294462502002716, "rewards/rejected": -0.3506166338920593, "step": 1160 }, { "epoch": 0.46, "learning_rate": 3.27840629131503e-06, "logits/chosen": 0.07861081510782242, "logits/rejected": 0.10706281661987305, "logps/chosen": -573.5535888671875, "logps/rejected": -588.395263671875, "loss": 0.0349, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2752663493156433, "rewards/margins": 0.04959608614444733, "rewards/rejected": -0.32486245036125183, "step": 1170 }, { "epoch": 0.46, "learning_rate": 3.2457706226404715e-06, "logits/chosen": 0.07944418489933014, "logits/rejected": 0.1640586107969284, "logps/chosen": -587.9627685546875, "logps/rejected": -604.2811889648438, "loss": 0.0383, "rewards/accuracies": 0.5875000357627869, "rewards/chosen": -0.29255762696266174, "rewards/margins": 0.04707466810941696, "rewards/rejected": -0.3396322429180145, "step": 1180 }, { "epoch": 0.47, "learning_rate": 3.2129948440117487e-06, "logits/chosen": 0.10159436613321304, "logits/rejected": 0.15032300353050232, "logps/chosen": -637.3546142578125, "logps/rejected": -640.6038818359375, "loss": 0.0247, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3146159052848816, "rewards/margins": 0.04759891331195831, "rewards/rejected": -0.3622148633003235, "step": 1190 }, { "epoch": 0.47, "learning_rate": 3.1800851131034904e-06, "logits/chosen": 0.11651058495044708, "logits/rejected": 0.10065825283527374, "logps/chosen": -591.7140502929688, "logps/rejected": -598.9595947265625, "loss": 0.0444, "rewards/accuracies": 0.595833420753479, "rewards/chosen": -0.31949153542518616, "rewards/margins": 0.048882994800806046, "rewards/rejected": -0.3683745265007019, "step": 1200 }, { "epoch": 0.47, "learning_rate": 3.147047612756302e-06, "logits/chosen": 0.11179051548242569, "logits/rejected": 0.1353040635585785, "logps/chosen": -612.1154174804688, "logps/rejected": -651.6345825195312, "loss": 0.031, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.3106861412525177, "rewards/margins": 0.052404046058654785, "rewards/rejected": -0.3630901873111725, "step": 1210 }, { "epoch": 0.48, "learning_rate": 3.1138885498151843e-06, "logits/chosen": 0.1667296588420868, "logits/rejected": 0.13556842505931854, "logps/chosen": -565.1221923828125, "logps/rejected": -589.7633056640625, "loss": 0.0313, "rewards/accuracies": 0.6708333492279053, "rewards/chosen": -0.2937767505645752, "rewards/margins": 0.06078268960118294, "rewards/rejected": -0.35455942153930664, "step": 1220 }, { "epoch": 0.48, "learning_rate": 3.0806141539634294e-06, "logits/chosen": 0.10828538239002228, "logits/rejected": 0.15916824340820312, "logps/chosen": -575.6380004882812, "logps/rejected": -566.2510986328125, "loss": 0.0379, "rewards/accuracies": 0.625, "rewards/chosen": -0.2859388291835785, "rewards/margins": 0.05137751251459122, "rewards/rejected": -0.3373163640499115, "step": 1230 }, { "epoch": 0.49, "learning_rate": 3.0472306765522393e-06, "logits/chosen": 0.11922699213027954, "logits/rejected": 0.1259177178144455, "logps/chosen": -606.871826171875, "logps/rejected": -612.5156860351562, "loss": 0.0398, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.30839142203330994, "rewards/margins": 0.03700650483369827, "rewards/rejected": -0.3453979790210724, "step": 1240 }, { "epoch": 0.49, "learning_rate": 3.0137443894262634e-06, "logits/chosen": 0.14493581652641296, "logits/rejected": 0.16140879690647125, "logps/chosen": -594.2196044921875, "logps/rejected": -594.2235717773438, "loss": 0.0355, "rewards/accuracies": 0.6041666865348816, "rewards/chosen": -0.30472633242607117, "rewards/margins": 0.052565477788448334, "rewards/rejected": -0.3572917878627777, "step": 1250 }, { "epoch": 0.49, "learning_rate": 2.980161583745294e-06, "logits/chosen": 0.10105246305465698, "logits/rejected": 0.15723805129528046, "logps/chosen": -620.9005126953125, "logps/rejected": -618.7329711914062, "loss": 0.0375, "rewards/accuracies": 0.6375000476837158, "rewards/chosen": -0.3132612705230713, "rewards/margins": 0.04581070318818092, "rewards/rejected": -0.3590719997882843, "step": 1260 }, { "epoch": 0.5, "learning_rate": 2.946488568802324e-06, "logits/chosen": 0.14716312289237976, "logits/rejected": 0.1699884533882141, "logps/chosen": -577.747314453125, "logps/rejected": -588.8140869140625, "loss": 0.0368, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2906225323677063, "rewards/margins": 0.04303520917892456, "rewards/rejected": -0.33365774154663086, "step": 1270 }, { "epoch": 0.5, "learning_rate": 2.912731670838207e-06, "logits/chosen": 0.12559036910533905, "logits/rejected": 0.14857833087444305, "logps/chosen": -573.2417602539062, "logps/rejected": -594.3810424804688, "loss": 0.0389, "rewards/accuracies": 0.6291666626930237, "rewards/chosen": -0.2716652452945709, "rewards/margins": 0.04548722505569458, "rewards/rejected": -0.3171524703502655, "step": 1280 }, { "epoch": 0.51, "learning_rate": 2.8788972318531272e-06, "logits/chosen": 0.13978327810764313, "logits/rejected": 0.15119092166423798, "logps/chosen": -543.723388671875, "logps/rejected": -543.7661743164062, "loss": 0.0406, "rewards/accuracies": 0.6416667103767395, "rewards/chosen": -0.2711487412452698, "rewards/margins": 0.03938648849725723, "rewards/rejected": -0.3105352520942688, "step": 1290 }, { "epoch": 0.51, "learning_rate": 2.844991608415113e-06, "logits/chosen": 0.07532133162021637, "logits/rejected": 0.12284326553344727, "logps/chosen": -540.3726806640625, "logps/rejected": -558.138671875, "loss": 0.0347, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2621740996837616, "rewards/margins": 0.039031222462654114, "rewards/rejected": -0.3012053370475769, "step": 1300 }, { "epoch": 0.51, "learning_rate": 2.8110211704658073e-06, "logits/chosen": 0.06420323997735977, "logits/rejected": 0.11035875231027603, "logps/chosen": -504.5506286621094, "logps/rejected": -545.575439453125, "loss": 0.0341, "rewards/accuracies": 0.6166666746139526, "rewards/chosen": -0.2541031241416931, "rewards/margins": 0.0474877692759037, "rewards/rejected": -0.3015909194946289, "step": 1310 }, { "epoch": 0.52, "learning_rate": 2.776992300123732e-06, "logits/chosen": 0.02886870503425598, "logits/rejected": 0.08844368159770966, "logps/chosen": -561.7490234375, "logps/rejected": -580.228759765625, "loss": 0.0398, "rewards/accuracies": 0.6166666746139526, "rewards/chosen": -0.27741044759750366, "rewards/margins": 0.040386758744716644, "rewards/rejected": -0.3177972435951233, "step": 1320 }, { "epoch": 0.52, "learning_rate": 2.742911390485262e-06, "logits/chosen": 0.07097556442022324, "logits/rejected": 0.12344779819250107, "logps/chosen": -502.0489807128906, "logps/rejected": -517.57861328125, "loss": 0.0385, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2432735413312912, "rewards/margins": 0.028061959892511368, "rewards/rejected": -0.27133551239967346, "step": 1330 }, { "epoch": 0.53, "learning_rate": 2.7087848444235354e-06, "logits/chosen": 0.0665472000837326, "logits/rejected": 0.12706132233142853, "logps/chosen": -494.16497802734375, "logps/rejected": -502.8155212402344, "loss": 0.0408, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.21378079056739807, "rewards/margins": 0.050316452980041504, "rewards/rejected": -0.2640972435474396, "step": 1340 }, { "epoch": 0.53, "learning_rate": 2.674619073385531e-06, "logits/chosen": 0.16853651404380798, "logits/rejected": 0.19799327850341797, "logps/chosen": -517.6676635742188, "logps/rejected": -565.3977661132812, "loss": 0.0398, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2295691967010498, "rewards/margins": 0.06171298772096634, "rewards/rejected": -0.29128220677375793, "step": 1350 }, { "epoch": 0.53, "learning_rate": 2.640420496187528e-06, "logits/chosen": 0.08946164697408676, "logits/rejected": 0.15218599140644073, "logps/chosen": -524.2635498046875, "logps/rejected": -535.5513916015625, "loss": 0.0406, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.23513486981391907, "rewards/margins": 0.049268923699855804, "rewards/rejected": -0.2844037711620331, "step": 1360 }, { "epoch": 0.54, "learning_rate": 2.6061955378091896e-06, "logits/chosen": 0.057071197777986526, "logits/rejected": 0.09025406837463379, "logps/chosen": -501.2740173339844, "logps/rejected": -527.2225341796875, "loss": 0.0333, "rewards/accuracies": 0.6375001072883606, "rewards/chosen": -0.22606611251831055, "rewards/margins": 0.05956178158521652, "rewards/rejected": -0.28562790155410767, "step": 1370 }, { "epoch": 0.54, "learning_rate": 2.5719506281864838e-06, "logits/chosen": 0.09907601028680801, "logits/rejected": 0.11537568271160126, "logps/chosen": -517.09033203125, "logps/rejected": -532.591796875, "loss": 0.0357, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": -0.23992018401622772, "rewards/margins": 0.0427953377366066, "rewards/rejected": -0.2827155292034149, "step": 1380 }, { "epoch": 0.55, "learning_rate": 2.537692201003671e-06, "logits/chosen": 0.0913078784942627, "logits/rejected": 0.0602453276515007, "logps/chosen": -498.9737243652344, "logps/rejected": -548.3724975585938, "loss": 0.0319, "rewards/accuracies": 0.625, "rewards/chosen": -0.2386229932308197, "rewards/margins": 0.045166097581386566, "rewards/rejected": -0.2837890386581421, "step": 1390 }, { "epoch": 0.55, "learning_rate": 2.503426692484594e-06, "logits/chosen": 0.08517016470432281, "logits/rejected": 0.07997085154056549, "logps/chosen": -555.7423095703125, "logps/rejected": -571.2945556640625, "loss": 0.0332, "rewards/accuracies": 0.6083333492279053, "rewards/chosen": -0.27391716837882996, "rewards/margins": 0.04417312890291214, "rewards/rejected": -0.3180902600288391, "step": 1400 }, { "epoch": 0.55, "learning_rate": 2.4691605401834843e-06, "logits/chosen": 0.15011510252952576, "logits/rejected": 0.2677749693393707, "logps/chosen": -542.283935546875, "logps/rejected": -584.0543212890625, "loss": 0.0402, "rewards/accuracies": 0.6416667103767395, "rewards/chosen": -0.2576281428337097, "rewards/margins": 0.05184938758611679, "rewards/rejected": -0.3094775080680847, "step": 1410 }, { "epoch": 0.56, "learning_rate": 2.434900181775524e-06, "logits/chosen": 0.07792448252439499, "logits/rejected": 0.06877502799034119, "logps/chosen": -571.9642944335938, "logps/rejected": -567.0635375976562, "loss": 0.0411, "rewards/accuracies": 0.6541666388511658, "rewards/chosen": -0.24740977585315704, "rewards/margins": 0.047786761075258255, "rewards/rejected": -0.2951965034008026, "step": 1420 }, { "epoch": 0.56, "learning_rate": 2.40065205384738e-06, "logits/chosen": 0.1122296079993248, "logits/rejected": 0.15267851948738098, "logps/chosen": -507.0908203125, "logps/rejected": -531.3499755859375, "loss": 0.032, "rewards/accuracies": 0.6208333373069763, "rewards/chosen": -0.23170237243175507, "rewards/margins": 0.04316466301679611, "rewards/rejected": -0.2748670279979706, "step": 1430 }, { "epoch": 0.57, "learning_rate": 2.3664225906879452e-06, "logits/chosen": 0.13994117081165314, "logits/rejected": 0.2027665674686432, "logps/chosen": -490.51116943359375, "logps/rejected": -501.07916259765625, "loss": 0.0379, "rewards/accuracies": 0.5916666388511658, "rewards/chosen": -0.2336377650499344, "rewards/margins": 0.03639250993728638, "rewards/rejected": -0.27003028988838196, "step": 1440 }, { "epoch": 0.57, "learning_rate": 2.3322182230795127e-06, "logits/chosen": 0.13082632422447205, "logits/rejected": 0.10433439910411835, "logps/chosen": -514.8374633789062, "logps/rejected": -540.4562377929688, "loss": 0.0465, "rewards/accuracies": 0.6125000715255737, "rewards/chosen": -0.23975805938243866, "rewards/margins": 0.04342503473162651, "rewards/rejected": -0.28318309783935547, "step": 1450 }, { "epoch": 0.57, "learning_rate": 2.298045377089604e-06, "logits/chosen": 0.10040481388568878, "logits/rejected": 0.2444113940000534, "logps/chosen": -543.1686401367188, "logps/rejected": -562.5289916992188, "loss": 0.0356, "rewards/accuracies": 0.6541666984558105, "rewards/chosen": -0.25354671478271484, "rewards/margins": 0.06049323081970215, "rewards/rejected": -0.3140399158000946, "step": 1460 }, { "epoch": 0.58, "learning_rate": 2.2639104728636915e-06, "logits/chosen": 0.17033420503139496, "logits/rejected": 0.07545267045497894, "logps/chosen": -586.39599609375, "logps/rejected": -605.8544921875, "loss": 0.0317, "rewards/accuracies": 0.5791666507720947, "rewards/chosen": -0.2881447374820709, "rewards/margins": 0.046030301600694656, "rewards/rejected": -0.3341750502586365, "step": 1470 }, { "epoch": 0.58, "learning_rate": 2.2298199234190236e-06, "logits/chosen": 0.11797505617141724, "logits/rejected": 0.13803385198116302, "logps/chosen": -584.68701171875, "logps/rejected": -591.5270385742188, "loss": 0.04, "rewards/accuracies": 0.6166666746139526, "rewards/chosen": -0.2901507019996643, "rewards/margins": 0.05170051008462906, "rewards/rejected": -0.3418511748313904, "step": 1480 }, { "epoch": 0.58, "learning_rate": 2.195780133439794e-06, "logits/chosen": 0.16074219346046448, "logits/rejected": 0.1192971020936966, "logps/chosen": -555.1654052734375, "logps/rejected": -564.3656005859375, "loss": 0.0372, "rewards/accuracies": 0.6041666865348816, "rewards/chosen": -0.26911431550979614, "rewards/margins": 0.048468612134456635, "rewards/rejected": -0.317582905292511, "step": 1490 }, { "epoch": 0.59, "learning_rate": 2.1617974980738814e-06, "logits/chosen": 0.1529623568058014, "logits/rejected": 0.1636894941329956, "logps/chosen": -540.774169921875, "logps/rejected": -556.2821655273438, "loss": 0.0372, "rewards/accuracies": 0.595833420753479, "rewards/chosen": -0.2688151001930237, "rewards/margins": 0.046817291527986526, "rewards/rejected": -0.3156324028968811, "step": 1500 }, { "epoch": 0.59, "learning_rate": 2.1278784017313688e-06, "logits/chosen": 0.14876937866210938, "logits/rejected": 0.17081575095653534, "logps/chosen": -536.3919067382812, "logps/rejected": -525.305419921875, "loss": 0.0416, "rewards/accuracies": 0.5791667103767395, "rewards/chosen": -0.25508958101272583, "rewards/margins": 0.0361848846077919, "rewards/rejected": -0.29127445816993713, "step": 1510 }, { "epoch": 0.6, "learning_rate": 2.0940292168850913e-06, "logits/chosen": 0.11073969304561615, "logits/rejected": 0.1865236610174179, "logps/chosen": -536.0477905273438, "logps/rejected": -549.0618896484375, "loss": 0.0291, "rewards/accuracies": 0.5624999403953552, "rewards/chosen": -0.27074000239372253, "rewards/margins": 0.03495349735021591, "rewards/rejected": -0.30569347739219666, "step": 1520 }, { "epoch": 0.6, "learning_rate": 2.060256302873421e-06, "logits/chosen": 0.11048384755849838, "logits/rejected": 0.13183829188346863, "logps/chosen": -535.2600708007812, "logps/rejected": -593.9940795898438, "loss": 0.0273, "rewards/accuracies": 0.6208333373069763, "rewards/chosen": -0.26926225423812866, "rewards/margins": 0.049711745232343674, "rewards/rejected": -0.31897395849227905, "step": 1530 }, { "epoch": 0.6, "learning_rate": 2.02656600470552e-06, "logits/chosen": 0.12331001460552216, "logits/rejected": 0.19340971112251282, "logps/chosen": -577.4415283203125, "logps/rejected": -596.8298950195312, "loss": 0.0432, "rewards/accuracies": 0.6166667342185974, "rewards/chosen": -0.28572091460227966, "rewards/margins": 0.04659656435251236, "rewards/rejected": -0.3323175013065338, "step": 1540 }, { "epoch": 0.61, "learning_rate": 1.99296465186929e-06, "logits/chosen": 0.10413268953561783, "logits/rejected": 0.23993226885795593, "logps/chosen": -587.4747314453125, "logps/rejected": -588.8648071289062, "loss": 0.0374, "rewards/accuracies": 0.5750000476837158, "rewards/chosen": -0.29135701060295105, "rewards/margins": 0.03427191823720932, "rewards/rejected": -0.32562893629074097, "step": 1550 }, { "epoch": 0.61, "learning_rate": 1.959458557142228e-06, "logits/chosen": 0.12116161733865738, "logits/rejected": 0.14634718000888824, "logps/chosen": -562.8546142578125, "logps/rejected": -598.8267822265625, "loss": 0.0346, "rewards/accuracies": 0.5791667103767395, "rewards/chosen": -0.2859250605106354, "rewards/margins": 0.04271562397480011, "rewards/rejected": -0.3286406993865967, "step": 1560 }, { "epoch": 0.62, "learning_rate": 1.9260540154054317e-06, "logits/chosen": 0.10976073890924454, "logits/rejected": 0.23895160853862762, "logps/chosen": -553.7778930664062, "logps/rejected": -583.1566772460938, "loss": 0.0354, "rewards/accuracies": 0.6708333492279053, "rewards/chosen": -0.28700724244117737, "rewards/margins": 0.04391475021839142, "rewards/rejected": -0.3309219777584076, "step": 1570 }, { "epoch": 0.62, "learning_rate": 1.8927573024609666e-06, "logits/chosen": 0.18281932175159454, "logits/rejected": 0.12242082506418228, "logps/chosen": -515.3033447265625, "logps/rejected": -552.0582885742188, "loss": 0.0435, "rewards/accuracies": 0.5750000476837158, "rewards/chosen": -0.2664439380168915, "rewards/margins": 0.04643597453832626, "rewards/rejected": -0.3128799498081207, "step": 1580 }, { "epoch": 0.62, "learning_rate": 1.8595746738528045e-06, "logits/chosen": 0.08008557558059692, "logits/rejected": 0.21306416392326355, "logps/chosen": -580.7192993164062, "logps/rejected": -610.1173095703125, "loss": 0.0355, "rewards/accuracies": 0.625, "rewards/chosen": -0.28699517250061035, "rewards/margins": 0.048240162432193756, "rewards/rejected": -0.3352353870868683, "step": 1590 }, { "epoch": 0.63, "learning_rate": 1.826512363691586e-06, "logits/chosen": 0.0883159190416336, "logits/rejected": 0.19013457000255585, "logps/chosen": -533.0752563476562, "logps/rejected": -550.2881469726562, "loss": 0.0375, "rewards/accuracies": 0.6208333373069763, "rewards/chosen": -0.27697938680648804, "rewards/margins": 0.04447559267282486, "rewards/rejected": -0.3214550018310547, "step": 1600 }, { "epoch": 0.63, "learning_rate": 1.7935765834833966e-06, "logits/chosen": 0.11177249252796173, "logits/rejected": 0.15451344847679138, "logps/chosen": -579.9695434570312, "logps/rejected": -609.8052368164062, "loss": 0.0407, "rewards/accuracies": 0.6458333730697632, "rewards/chosen": -0.298335462808609, "rewards/margins": 0.053702156990766525, "rewards/rejected": -0.35203760862350464, "step": 1610 }, { "epoch": 0.64, "learning_rate": 1.7607735209627953e-06, "logits/chosen": 0.07564587891101837, "logits/rejected": 0.1478845179080963, "logps/chosen": -583.5760498046875, "logps/rejected": -615.6829223632812, "loss": 0.0415, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3110070824623108, "rewards/margins": 0.060508888214826584, "rewards/rejected": -0.3715159296989441, "step": 1620 }, { "epoch": 0.64, "learning_rate": 1.7281093389303105e-06, "logits/chosen": 0.08597441017627716, "logits/rejected": 0.24169404804706573, "logps/chosen": -596.5226440429688, "logps/rejected": -639.04052734375, "loss": 0.0301, "rewards/accuracies": 0.6166666746139526, "rewards/chosen": -0.3174855411052704, "rewards/margins": 0.05773097276687622, "rewards/rejected": -0.375216543674469, "step": 1630 }, { "epoch": 0.64, "learning_rate": 1.6955901740946136e-06, "logits/chosen": 0.12801986932754517, "logits/rejected": 0.15590648353099823, "logps/chosen": -595.8865966796875, "logps/rejected": -616.6199951171875, "loss": 0.0265, "rewards/accuracies": 0.6083333492279053, "rewards/chosen": -0.30426111817359924, "rewards/margins": 0.04693468287587166, "rewards/rejected": -0.3511958420276642, "step": 1640 }, { "epoch": 0.65, "learning_rate": 1.663222135919601e-06, "logits/chosen": 0.13937242329120636, "logits/rejected": 0.1421765387058258, "logps/chosen": -597.5296020507812, "logps/rejected": -603.4146728515625, "loss": 0.042, "rewards/accuracies": 0.6583333611488342, "rewards/chosen": -0.29278069734573364, "rewards/margins": 0.052191488444805145, "rewards/rejected": -0.3449721336364746, "step": 1650 }, { "epoch": 0.65, "learning_rate": 1.6310113054765947e-06, "logits/chosen": 0.09593503177165985, "logits/rejected": 0.22683358192443848, "logps/chosen": -555.0337524414062, "logps/rejected": -579.9691772460938, "loss": 0.0433, "rewards/accuracies": 0.6083333492279053, "rewards/chosen": -0.28781309723854065, "rewards/margins": 0.05301021412014961, "rewards/rejected": -0.3408232629299164, "step": 1660 }, { "epoch": 0.66, "learning_rate": 1.5989637343018705e-06, "logits/chosen": 0.1495979279279709, "logits/rejected": 0.20952686667442322, "logps/chosen": -569.4849853515625, "logps/rejected": -616.1697387695312, "loss": 0.0479, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.30492356419563293, "rewards/margins": 0.059173502027988434, "rewards/rejected": -0.36409705877304077, "step": 1670 }, { "epoch": 0.66, "learning_rate": 1.5670854432597433e-06, "logits/chosen": 0.1326146423816681, "logits/rejected": 0.20724856853485107, "logps/chosen": -584.0203247070312, "logps/rejected": -590.6256103515625, "loss": 0.0353, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.28019410371780396, "rewards/margins": 0.04888462647795677, "rewards/rejected": -0.329078733921051, "step": 1680 }, { "epoch": 0.66, "learning_rate": 1.5353824214114075e-06, "logits/chosen": 0.1035301685333252, "logits/rejected": 0.16259366273880005, "logps/chosen": -548.8323364257812, "logps/rejected": -565.8499755859375, "loss": 0.0414, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2619378864765167, "rewards/margins": 0.03961848467588425, "rewards/rejected": -0.3015563189983368, "step": 1690 }, { "epoch": 0.67, "learning_rate": 1.5038606248897586e-06, "logits/chosen": 0.06201974302530289, "logits/rejected": 0.09014655649662018, "logps/chosen": -521.3331909179688, "logps/rejected": -544.2221069335938, "loss": 0.0389, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": -0.2565329074859619, "rewards/margins": 0.04992014169692993, "rewards/rejected": -0.30645304918289185, "step": 1700 }, { "epoch": 0.67, "learning_rate": 1.4725259757803983e-06, "logits/chosen": 0.16263772547245026, "logits/rejected": 0.13033676147460938, "logps/chosen": -553.0350341796875, "logps/rejected": -569.7715454101562, "loss": 0.0452, "rewards/accuracies": 0.6125000715255737, "rewards/chosen": -0.2759607434272766, "rewards/margins": 0.05112532526254654, "rewards/rejected": -0.32708609104156494, "step": 1710 }, { "epoch": 0.68, "learning_rate": 1.4413843610090342e-06, "logits/chosen": 0.1796967089176178, "logits/rejected": 0.1423921287059784, "logps/chosen": -598.36181640625, "logps/rejected": -588.3529052734375, "loss": 0.0359, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.28392156958580017, "rewards/margins": 0.06371378898620605, "rewards/rejected": -0.347635418176651, "step": 1720 }, { "epoch": 0.68, "learning_rate": 1.410441631235487e-06, "logits/chosen": 0.12067948281764984, "logits/rejected": 0.18646660447120667, "logps/chosen": -594.8400268554688, "logps/rejected": -634.2889404296875, "loss": 0.0397, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.30511975288391113, "rewards/margins": 0.045869968831539154, "rewards/rejected": -0.3509897291660309, "step": 1730 }, { "epoch": 0.68, "learning_rate": 1.3797035997545144e-06, "logits/chosen": 0.044562358409166336, "logits/rejected": 0.04120717570185661, "logps/chosen": -540.0699462890625, "logps/rejected": -543.9365234375, "loss": 0.0289, "rewards/accuracies": 0.5916666984558105, "rewards/chosen": -0.2733212113380432, "rewards/margins": 0.04933062568306923, "rewards/rejected": -0.32265186309814453, "step": 1740 }, { "epoch": 0.69, "learning_rate": 1.3491760414036478e-06, "logits/chosen": 0.11400707066059113, "logits/rejected": 0.09495958685874939, "logps/chosen": -565.5853271484375, "logps/rejected": -576.4033203125, "loss": 0.0305, "rewards/accuracies": 0.6083333492279053, "rewards/chosen": -0.2805452346801758, "rewards/margins": 0.04857431724667549, "rewards/rejected": -0.32911956310272217, "step": 1750 }, { "epoch": 0.69, "learning_rate": 1.3188646914782616e-06, "logits/chosen": 0.11315472424030304, "logits/rejected": 0.208398699760437, "logps/chosen": -598.4920043945312, "logps/rejected": -589.9212036132812, "loss": 0.0348, "rewards/accuracies": 0.6083333492279053, "rewards/chosen": -0.2833411991596222, "rewards/margins": 0.04889676719903946, "rewards/rejected": -0.33223795890808105, "step": 1760 }, { "epoch": 0.69, "learning_rate": 1.288775244654062e-06, "logits/chosen": 0.08946464955806732, "logits/rejected": 0.154561847448349, "logps/chosen": -599.8006591796875, "logps/rejected": -594.125732421875, "loss": 0.0348, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.28097003698349, "rewards/margins": 0.04311807453632355, "rewards/rejected": -0.32408809661865234, "step": 1770 }, { "epoch": 0.7, "learning_rate": 1.2589133539172193e-06, "logits/chosen": 0.12887665629386902, "logits/rejected": 0.2141556292772293, "logps/chosen": -576.6539916992188, "logps/rejected": -609.6483764648438, "loss": 0.0336, "rewards/accuracies": 0.625, "rewards/chosen": -0.2800661623477936, "rewards/margins": 0.057719629257917404, "rewards/rejected": -0.3377857804298401, "step": 1780 }, { "epoch": 0.7, "learning_rate": 1.2292846295023222e-06, "logits/chosen": 0.10555066913366318, "logits/rejected": 0.24661192297935486, "logps/chosen": -565.2025146484375, "logps/rejected": -572.82666015625, "loss": 0.03, "rewards/accuracies": 0.5291666984558105, "rewards/chosen": -0.2735806107521057, "rewards/margins": 0.03343730419874191, "rewards/rejected": -0.30701789259910583, "step": 1790 }, { "epoch": 0.71, "learning_rate": 1.19989463783837e-06, "logits/chosen": 0.07563383877277374, "logits/rejected": 0.14022833108901978, "logps/chosen": -563.0352783203125, "logps/rejected": -582.5489501953125, "loss": 0.0416, "rewards/accuracies": 0.6166666746139526, "rewards/chosen": -0.26767465472221375, "rewards/margins": 0.049182355403900146, "rewards/rejected": -0.3168570399284363, "step": 1800 }, { "epoch": 0.71, "learning_rate": 1.1707489005029877e-06, "logits/chosen": 0.10331223905086517, "logits/rejected": 0.10989202558994293, "logps/chosen": -566.6705932617188, "logps/rejected": -583.3214111328125, "loss": 0.0434, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.279958575963974, "rewards/margins": 0.05120917409658432, "rewards/rejected": -0.3311677575111389, "step": 1810 }, { "epoch": 0.71, "learning_rate": 1.1418528931850781e-06, "logits/chosen": 0.13162486255168915, "logits/rejected": 0.09169518947601318, "logps/chosen": -571.5591430664062, "logps/rejected": -575.8829956054688, "loss": 0.0449, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.28504106402397156, "rewards/margins": 0.039923541247844696, "rewards/rejected": -0.32496461272239685, "step": 1820 }, { "epoch": 0.72, "learning_rate": 1.113212044656087e-06, "logits/chosen": 0.11077318340539932, "logits/rejected": 0.14336992800235748, "logps/chosen": -573.6709594726562, "logps/rejected": -595.0328979492188, "loss": 0.0427, "rewards/accuracies": 0.6458333730697632, "rewards/chosen": -0.2933489680290222, "rewards/margins": 0.0513363853096962, "rewards/rejected": -0.3446853756904602, "step": 1830 }, { "epoch": 0.72, "learning_rate": 1.0848317357500854e-06, "logits/chosen": 0.10393325984477997, "logits/rejected": 0.1483684927225113, "logps/chosen": -568.5980834960938, "logps/rejected": -586.1634521484375, "loss": 0.036, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.28037458658218384, "rewards/margins": 0.05212429165840149, "rewards/rejected": -0.33249884843826294, "step": 1840 }, { "epoch": 0.73, "learning_rate": 1.0567172983528534e-06, "logits/chosen": 0.026229266077280045, "logits/rejected": 0.13465450704097748, "logps/chosen": -508.6505432128906, "logps/rejected": -546.9326782226562, "loss": 0.0456, "rewards/accuracies": 0.6333333849906921, "rewards/chosen": -0.27017685770988464, "rewards/margins": 0.041460223495960236, "rewards/rejected": -0.31163710355758667, "step": 1850 }, { "epoch": 0.73, "learning_rate": 1.0288740144001722e-06, "logits/chosen": 0.09365380555391312, "logits/rejected": 0.0902700275182724, "logps/chosen": -572.3986206054688, "logps/rejected": -586.7833251953125, "loss": 0.0342, "rewards/accuracies": 0.6208333373069763, "rewards/chosen": -0.26681095361709595, "rewards/margins": 0.04616251215338707, "rewards/rejected": -0.3129734396934509, "step": 1860 }, { "epoch": 0.73, "learning_rate": 1.0013071148854861e-06, "logits/chosen": 0.10842391103506088, "logits/rejected": 0.10454890877008438, "logps/chosen": -538.0030517578125, "logps/rejected": -558.1130981445312, "loss": 0.0364, "rewards/accuracies": 0.6250000596046448, "rewards/chosen": -0.26716285943984985, "rewards/margins": 0.05413733050227165, "rewards/rejected": -0.321300208568573, "step": 1870 }, { "epoch": 0.74, "learning_rate": 9.740217788771453e-07, "logits/chosen": 0.08302603662014008, "logits/rejected": 0.18150724470615387, "logps/chosen": -511.07769775390625, "logps/rejected": -545.5850219726562, "loss": 0.0413, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.24858680367469788, "rewards/margins": 0.04247165471315384, "rewards/rejected": -0.2910584807395935, "step": 1880 }, { "epoch": 0.74, "learning_rate": 9.470231325453958e-07, "logits/chosen": 0.12578806281089783, "logits/rejected": 0.231987863779068, "logps/chosen": -514.3307495117188, "logps/rejected": -531.3946533203125, "loss": 0.0429, "rewards/accuracies": 0.6041666865348816, "rewards/chosen": -0.24261601269245148, "rewards/margins": 0.05155906826257706, "rewards/rejected": -0.29417508840560913, "step": 1890 }, { "epoch": 0.75, "learning_rate": 9.203162481993175e-07, "logits/chosen": 0.16151490807533264, "logits/rejected": 0.17980621755123138, "logps/chosen": -496.478515625, "logps/rejected": -538.2314453125, "loss": 0.0358, "rewards/accuracies": 0.6250000596046448, "rewards/chosen": -0.2318914234638214, "rewards/margins": 0.05293119698762894, "rewards/rejected": -0.28482261300086975, "step": 1900 }, { "epoch": 0.75, "learning_rate": 8.939061433338722e-07, "logits/chosen": 0.09906987845897675, "logits/rejected": 0.2187938690185547, "logps/chosen": -526.57275390625, "logps/rejected": -556.8406372070312, "loss": 0.0398, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": -0.25222307443618774, "rewards/margins": 0.050073761492967606, "rewards/rejected": -0.30229681730270386, "step": 1910 }, { "epoch": 0.75, "learning_rate": 8.677977796872541e-07, "logits/chosen": 0.15915422141551971, "logits/rejected": 0.11833087354898453, "logps/chosen": -529.197021484375, "logps/rejected": -534.5230712890625, "loss": 0.0419, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": -0.25595852732658386, "rewards/margins": 0.04330848529934883, "rewards/rejected": -0.299267053604126, "step": 1920 }, { "epoch": 0.76, "learning_rate": 8.419960623087129e-07, "logits/chosen": 0.20990967750549316, "logits/rejected": 0.13641339540481567, "logps/chosen": -507.27008056640625, "logps/rejected": -555.5889892578125, "loss": 0.046, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.24782676994800568, "rewards/margins": 0.04908429831266403, "rewards/rejected": -0.2969110608100891, "step": 1930 }, { "epoch": 0.76, "learning_rate": 8.165058386370314e-07, "logits/chosen": 0.04395260289311409, "logits/rejected": 0.13358573615550995, "logps/chosen": -534.5526733398438, "logps/rejected": -559.8387451171875, "loss": 0.0401, "rewards/accuracies": 0.625, "rewards/chosen": -0.25756314396858215, "rewards/margins": 0.04230283945798874, "rewards/rejected": -0.2998659908771515, "step": 1940 }, { "epoch": 0.77, "learning_rate": 7.913318975898238e-07, "logits/chosen": 0.11071997880935669, "logits/rejected": 0.12993398308753967, "logps/chosen": -565.89794921875, "logps/rejected": -564.6920166015625, "loss": 0.0335, "rewards/accuracies": 0.6333333849906921, "rewards/chosen": -0.2614702880382538, "rewards/margins": 0.04234743118286133, "rewards/rejected": -0.3038177192211151, "step": 1950 }, { "epoch": 0.77, "learning_rate": 7.664789686638272e-07, "logits/chosen": 0.09845000505447388, "logits/rejected": 0.16275520622730255, "logps/chosen": -514.97119140625, "logps/rejected": -554.59423828125, "loss": 0.0415, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2425486147403717, "rewards/margins": 0.05070207640528679, "rewards/rejected": -0.29325070977211, "step": 1960 }, { "epoch": 0.77, "learning_rate": 7.41951721046357e-07, "logits/chosen": 0.09489642083644867, "logits/rejected": 0.11213739961385727, "logps/chosen": -586.5117797851562, "logps/rejected": -600.290283203125, "loss": 0.0358, "rewards/accuracies": 0.6083332896232605, "rewards/chosen": -0.27559903264045715, "rewards/margins": 0.047925811260938644, "rewards/rejected": -0.3235248625278473, "step": 1970 }, { "epoch": 0.78, "learning_rate": 7.177547627380987e-07, "logits/chosen": 0.08067229390144348, "logits/rejected": 0.1412743777036667, "logps/chosen": -555.6590576171875, "logps/rejected": -576.60791015625, "loss": 0.0507, "rewards/accuracies": 0.595833420753479, "rewards/chosen": -0.27716270089149475, "rewards/margins": 0.04384743049740791, "rewards/rejected": -0.32101011276245117, "step": 1980 }, { "epoch": 0.78, "learning_rate": 6.93892639687386e-07, "logits/chosen": 0.09003408253192902, "logits/rejected": 0.1542806327342987, "logps/chosen": -576.6299438476562, "logps/rejected": -576.0133666992188, "loss": 0.0341, "rewards/accuracies": 0.5750000476837158, "rewards/chosen": -0.27841562032699585, "rewards/margins": 0.045767974108457565, "rewards/rejected": -0.3241836130619049, "step": 1990 }, { "epoch": 0.79, "learning_rate": 6.703698349361437e-07, "logits/chosen": 0.10635080188512802, "logits/rejected": 0.05730734393000603, "logps/chosen": -577.123779296875, "logps/rejected": -601.00390625, "loss": 0.0361, "rewards/accuracies": 0.625, "rewards/chosen": -0.28797826170921326, "rewards/margins": 0.05532165244221687, "rewards/rejected": -0.34329989552497864, "step": 2000 }, { "epoch": 0.79, "learning_rate": 6.471907677776426e-07, "logits/chosen": 0.15324482321739197, "logits/rejected": 0.06719908863306046, "logps/chosen": -561.4310302734375, "logps/rejected": -571.0897216796875, "loss": 0.032, "rewards/accuracies": 0.625, "rewards/chosen": -0.2713034451007843, "rewards/margins": 0.058576516807079315, "rewards/rejected": -0.3298799395561218, "step": 2010 }, { "epoch": 0.79, "learning_rate": 6.243597929262404e-07, "logits/chosen": 0.06141387298703194, "logits/rejected": 0.14357991516590118, "logps/chosen": -573.6912841796875, "logps/rejected": -616.2233276367188, "loss": 0.0404, "rewards/accuracies": 0.6041666865348816, "rewards/chosen": -0.28281545639038086, "rewards/margins": 0.05899351090192795, "rewards/rejected": -0.3418089747428894, "step": 2020 }, { "epoch": 0.8, "learning_rate": 6.018811996992455e-07, "logits/chosen": 0.07172363996505737, "logits/rejected": 0.12844975292682648, "logps/chosen": -600.66455078125, "logps/rejected": -590.8489990234375, "loss": 0.0311, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.27809762954711914, "rewards/margins": 0.05610889941453934, "rewards/rejected": -0.3342065215110779, "step": 2030 }, { "epoch": 0.8, "learning_rate": 5.797592112110734e-07, "logits/chosen": 0.06431909650564194, "logits/rejected": 0.09170184284448624, "logps/chosen": -547.0779418945312, "logps/rejected": -577.0574340820312, "loss": 0.0331, "rewards/accuracies": 0.6166666746139526, "rewards/chosen": -0.2835572361946106, "rewards/margins": 0.051959216594696045, "rewards/rejected": -0.33551645278930664, "step": 2040 }, { "epoch": 0.8, "learning_rate": 5.579979835798361e-07, "logits/chosen": 0.0938582718372345, "logits/rejected": 0.1470281332731247, "logps/chosen": -540.1471557617188, "logps/rejected": -592.1015014648438, "loss": 0.041, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2827063202857971, "rewards/margins": 0.060177069157361984, "rewards/rejected": -0.3428834080696106, "step": 2050 }, { "epoch": 0.81, "learning_rate": 5.366016051465245e-07, "logits/chosen": 0.0799831822514534, "logits/rejected": 0.12903760373592377, "logps/chosen": -581.998779296875, "logps/rejected": -606.8611450195312, "loss": 0.0291, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2922040522098541, "rewards/margins": 0.05147022008895874, "rewards/rejected": -0.3436742424964905, "step": 2060 }, { "epoch": 0.81, "learning_rate": 5.155740957069186e-07, "logits/chosen": 0.10282020270824432, "logits/rejected": 0.14179909229278564, "logps/chosen": -559.0867919921875, "logps/rejected": -587.3806762695312, "loss": 0.0428, "rewards/accuracies": 0.5791667103767395, "rewards/chosen": -0.27100348472595215, "rewards/margins": 0.05496737360954285, "rewards/rejected": -0.325970858335495, "step": 2070 }, { "epoch": 0.82, "learning_rate": 4.949194057563783e-07, "logits/chosen": 0.10007326304912567, "logits/rejected": 0.15941056609153748, "logps/chosen": -560.44140625, "logps/rejected": -567.3951416015625, "loss": 0.0402, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -0.26652681827545166, "rewards/margins": 0.050540946424007416, "rewards/rejected": -0.3170677125453949, "step": 2080 }, { "epoch": 0.82, "learning_rate": 4.746414157476506e-07, "logits/chosen": 0.1249409168958664, "logits/rejected": 0.054917313158512115, "logps/chosen": -530.5921630859375, "logps/rejected": -524.3117065429688, "loss": 0.0358, "rewards/accuracies": 0.6458333730697632, "rewards/chosen": -0.2425779104232788, "rewards/margins": 0.05763505771756172, "rewards/rejected": -0.30021294951438904, "step": 2090 }, { "epoch": 0.82, "learning_rate": 4.5474393536184214e-07, "logits/chosen": 0.07765215635299683, "logits/rejected": 0.10989616811275482, "logps/chosen": -548.0401611328125, "logps/rejected": -552.0130004882812, "loss": 0.0343, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.26287007331848145, "rewards/margins": 0.05279888957738876, "rewards/rejected": -0.31566891074180603, "step": 2100 }, { "epoch": 0.83, "learning_rate": 4.352307027926828e-07, "logits/chosen": 0.06283166259527206, "logits/rejected": 0.21911051869392395, "logps/chosen": -560.373779296875, "logps/rejected": -582.3746337890625, "loss": 0.0347, "rewards/accuracies": 0.6208333373069763, "rewards/chosen": -0.27789947390556335, "rewards/margins": 0.05053368955850601, "rewards/rejected": -0.32843321561813354, "step": 2110 }, { "epoch": 0.83, "learning_rate": 4.1610538404421837e-07, "logits/chosen": 0.021615978330373764, "logits/rejected": 0.13444559276103973, "logps/chosen": -549.7747802734375, "logps/rejected": -571.6242065429688, "loss": 0.0428, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.27358126640319824, "rewards/margins": 0.04225178807973862, "rewards/rejected": -0.31583306193351746, "step": 2120 }, { "epoch": 0.84, "learning_rate": 3.9737157224207265e-07, "logits/chosen": 0.12500056624412537, "logits/rejected": 0.20162144303321838, "logps/chosen": -572.5682983398438, "logps/rejected": -572.8493041992188, "loss": 0.0435, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.27676117420196533, "rewards/margins": 0.04713458567857742, "rewards/rejected": -0.32389578223228455, "step": 2130 }, { "epoch": 0.84, "learning_rate": 3.7903278695839456e-07, "logits/chosen": 0.06253985315561295, "logits/rejected": 0.14370861649513245, "logps/chosen": -567.3818359375, "logps/rejected": -567.6959838867188, "loss": 0.0463, "rewards/accuracies": 0.5541666746139526, "rewards/chosen": -0.2864064574241638, "rewards/margins": 0.04097602888941765, "rewards/rejected": -0.32738247513771057, "step": 2140 }, { "epoch": 0.84, "learning_rate": 3.610924735506274e-07, "logits/chosen": 0.10091836750507355, "logits/rejected": 0.1071823462843895, "logps/chosen": -560.47705078125, "logps/rejected": -581.1371459960938, "loss": 0.0352, "rewards/accuracies": 0.6208333373069763, "rewards/chosen": -0.2785682678222656, "rewards/margins": 0.050090573728084564, "rewards/rejected": -0.3286588191986084, "step": 2150 }, { "epoch": 0.85, "learning_rate": 3.4355400251421977e-07, "logits/chosen": 0.04096272587776184, "logits/rejected": 0.25352293252944946, "logps/chosen": -542.7060546875, "logps/rejected": -555.5399169921875, "loss": 0.0376, "rewards/accuracies": 0.5625000596046448, "rewards/chosen": -0.273470014333725, "rewards/margins": 0.03956315666437149, "rewards/rejected": -0.31303319334983826, "step": 2160 }, { "epoch": 0.85, "learning_rate": 3.2642066884940064e-07, "logits/chosen": 0.11667946726083755, "logits/rejected": 0.10625074058771133, "logps/chosen": -543.03564453125, "logps/rejected": -578.9127197265625, "loss": 0.0342, "rewards/accuracies": 0.6083332896232605, "rewards/chosen": -0.2704079747200012, "rewards/margins": 0.05524685978889465, "rewards/rejected": -0.32565486431121826, "step": 2170 }, { "epoch": 0.86, "learning_rate": 3.0969569144214147e-07, "logits/chosen": 0.11611845344305038, "logits/rejected": 0.13405680656433105, "logps/chosen": -521.346435546875, "logps/rejected": -532.4287719726562, "loss": 0.0293, "rewards/accuracies": 0.5708333849906921, "rewards/chosen": -0.26295241713523865, "rewards/margins": 0.04004523903131485, "rewards/rejected": -0.3029976785182953, "step": 2180 }, { "epoch": 0.86, "learning_rate": 2.933822124594124e-07, "logits/chosen": 0.13651199638843536, "logits/rejected": 0.14987996220588684, "logps/chosen": -579.0687866210938, "logps/rejected": -603.724609375, "loss": 0.0377, "rewards/accuracies": 0.6458333730697632, "rewards/chosen": -0.2872801423072815, "rewards/margins": 0.05356990173459053, "rewards/rejected": -0.34085002541542053, "step": 2190 }, { "epoch": 0.86, "learning_rate": 2.774832967588556e-07, "logits/chosen": 0.10344930738210678, "logits/rejected": 0.1647767573595047, "logps/chosen": -556.7415771484375, "logps/rejected": -550.4581298828125, "loss": 0.0414, "rewards/accuracies": 0.5583333373069763, "rewards/chosen": -0.26868194341659546, "rewards/margins": 0.05047239735722542, "rewards/rejected": -0.3191543221473694, "step": 2200 }, { "epoch": 0.87, "learning_rate": 2.6200193131298376e-07, "logits/chosen": 0.12514041364192963, "logits/rejected": 0.12365315854549408, "logps/chosen": -553.3133544921875, "logps/rejected": -573.3492431640625, "loss": 0.0293, "rewards/accuracies": 0.6166666746139526, "rewards/chosen": -0.2692439556121826, "rewards/margins": 0.05325891822576523, "rewards/rejected": -0.32250285148620605, "step": 2210 }, { "epoch": 0.87, "learning_rate": 2.469410246480067e-07, "logits/chosen": 0.07550040632486343, "logits/rejected": 0.16558191180229187, "logps/chosen": -587.4112548828125, "logps/rejected": -619.96826171875, "loss": 0.0337, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.29542380571365356, "rewards/margins": 0.06875937432050705, "rewards/rejected": -0.3641831576824188, "step": 2220 }, { "epoch": 0.88, "learning_rate": 2.3230340629740166e-07, "logits/chosen": 0.14227424561977386, "logits/rejected": 0.07906799018383026, "logps/chosen": -546.7794189453125, "logps/rejected": -545.5219116210938, "loss": 0.0365, "rewards/accuracies": 0.5458333492279053, "rewards/chosen": -0.2760690450668335, "rewards/margins": 0.025999590754508972, "rewards/rejected": -0.3020685911178589, "step": 2230 }, { "epoch": 0.88, "learning_rate": 2.1809182627031883e-07, "logits/chosen": 0.13371774554252625, "logits/rejected": 0.1723383665084839, "logps/chosen": -586.3175048828125, "logps/rejected": -594.6130981445312, "loss": 0.0351, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.2775975465774536, "rewards/margins": 0.04781870171427727, "rewards/rejected": -0.3254162669181824, "step": 2240 }, { "epoch": 0.88, "learning_rate": 2.0430895453492944e-07, "logits/chosen": 0.08875533938407898, "logits/rejected": 0.16999275982379913, "logps/chosen": -580.5711669921875, "logps/rejected": -608.8607788085938, "loss": 0.0398, "rewards/accuracies": 0.6291667222976685, "rewards/chosen": -0.2822466790676117, "rewards/margins": 0.047433655709028244, "rewards/rejected": -0.32968032360076904, "step": 2250 }, { "epoch": 0.89, "learning_rate": 1.9095738051681412e-07, "logits/chosen": 0.19837184250354767, "logits/rejected": 0.16092568635940552, "logps/chosen": -592.68701171875, "logps/rejected": -623.7216796875, "loss": 0.0483, "rewards/accuracies": 0.6791666746139526, "rewards/chosen": -0.29589009284973145, "rewards/margins": 0.05754246562719345, "rewards/rejected": -0.3534325063228607, "step": 2260 }, { "epoch": 0.89, "learning_rate": 1.7803961261247864e-07, "logits/chosen": 0.16064941883087158, "logits/rejected": 0.13991779088974, "logps/chosen": -573.6605834960938, "logps/rejected": -612.8917846679688, "loss": 0.0378, "rewards/accuracies": 0.6666667461395264, "rewards/chosen": -0.2881428599357605, "rewards/margins": 0.06800989806652069, "rewards/rejected": -0.3561527132987976, "step": 2270 }, { "epoch": 0.89, "learning_rate": 1.6555807771809375e-07, "logits/chosen": 0.10939987748861313, "logits/rejected": 0.08681745082139969, "logps/chosen": -572.9388427734375, "logps/rejected": -569.3970947265625, "loss": 0.0373, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2807319760322571, "rewards/margins": 0.047644276171922684, "rewards/rejected": -0.32837623357772827, "step": 2280 }, { "epoch": 0.9, "learning_rate": 1.5351512077355024e-07, "logits/chosen": 0.0837242603302002, "logits/rejected": 0.1572417914867401, "logps/chosen": -587.7030029296875, "logps/rejected": -579.8245239257812, "loss": 0.0388, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2788943350315094, "rewards/margins": 0.04011840373277664, "rewards/rejected": -0.31901273131370544, "step": 2290 }, { "epoch": 0.9, "learning_rate": 1.4191300432190634e-07, "logits/chosen": 0.04972783476114273, "logits/rejected": 0.2529766261577606, "logps/chosen": -563.1866455078125, "logps/rejected": -596.42578125, "loss": 0.0425, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.29752233624458313, "rewards/margins": 0.0639050304889679, "rewards/rejected": -0.36142733693122864, "step": 2300 }, { "epoch": 0.91, "learning_rate": 1.3075390808431897e-07, "logits/chosen": 0.11287392675876617, "logits/rejected": 0.11052433401346207, "logps/chosen": -533.6840209960938, "logps/rejected": -540.5704956054688, "loss": 0.0392, "rewards/accuracies": 0.5916666984558105, "rewards/chosen": -0.27624043822288513, "rewards/margins": 0.0406770259141922, "rewards/rejected": -0.3169174790382385, "step": 2310 }, { "epoch": 0.91, "learning_rate": 1.2003992855053326e-07, "logits/chosen": 0.13198883831501007, "logits/rejected": 0.1386340856552124, "logps/chosen": -547.6902465820312, "logps/rejected": -601.7785034179688, "loss": 0.0342, "rewards/accuracies": 0.5916666984558105, "rewards/chosen": -0.28226011991500854, "rewards/margins": 0.05512434244155884, "rewards/rejected": -0.337384432554245, "step": 2320 }, { "epoch": 0.91, "learning_rate": 1.0977307858500818e-07, "logits/chosen": 0.1405051052570343, "logits/rejected": 0.12027152627706528, "logps/chosen": -545.7700805664062, "logps/rejected": -588.6355590820312, "loss": 0.0343, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2875562608242035, "rewards/margins": 0.052831798791885376, "rewards/rejected": -0.34038805961608887, "step": 2330 }, { "epoch": 0.92, "learning_rate": 9.995528704875635e-08, "logits/chosen": 0.12197569757699966, "logits/rejected": 0.025906020775437355, "logps/chosen": -571.2064819335938, "logps/rejected": -597.6893310546875, "loss": 0.0347, "rewards/accuracies": 0.6375000476837158, "rewards/chosen": -0.28911882638931274, "rewards/margins": 0.054231010377407074, "rewards/rejected": -0.3433498442173004, "step": 2340 }, { "epoch": 0.92, "learning_rate": 9.058839843696237e-08, "logits/chosen": 0.09433220326900482, "logits/rejected": 0.20620909333229065, "logps/chosen": -555.5418701171875, "logps/rejected": -581.8546142578125, "loss": 0.0351, "rewards/accuracies": 0.5916666984558105, "rewards/chosen": -0.2735230326652527, "rewards/margins": 0.05654352903366089, "rewards/rejected": -0.33006659150123596, "step": 2350 }, { "epoch": 0.93, "learning_rate": 8.167417253245213e-08, "logits/chosen": 0.08379559963941574, "logits/rejected": 0.15960299968719482, "logps/chosen": -575.5586547851562, "logps/rejected": -596.3479614257812, "loss": 0.0374, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.29097455739974976, "rewards/margins": 0.043259985744953156, "rewards/rejected": -0.3342345356941223, "step": 2360 }, { "epoch": 0.93, "learning_rate": 7.32142840750788e-08, "logits/chosen": 0.15721924602985382, "logits/rejected": 0.11612733453512192, "logps/chosen": -598.3759155273438, "logps/rejected": -604.84228515625, "loss": 0.036, "rewards/accuracies": 0.595833420753479, "rewards/chosen": -0.28792813420295715, "rewards/margins": 0.046954017132520676, "rewards/rejected": -0.3348821699619293, "step": 2370 }, { "epoch": 0.93, "learning_rate": 6.521032244708375e-08, "logits/chosen": 0.1785806119441986, "logits/rejected": 0.12633617222309113, "logps/chosen": -576.2505493164062, "logps/rejected": -601.0803833007812, "loss": 0.0395, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.2926011085510254, "rewards/margins": 0.04950517788529396, "rewards/rejected": -0.34210631251335144, "step": 2380 }, { "epoch": 0.94, "learning_rate": 5.766379137449624e-08, "logits/chosen": 0.09354978054761887, "logits/rejected": 0.20856766402721405, "logps/chosen": -558.0820922851562, "logps/rejected": -587.3670043945312, "loss": 0.0278, "rewards/accuracies": 0.6291667222976685, "rewards/chosen": -0.2841167449951172, "rewards/margins": 0.054156333208084106, "rewards/rejected": -0.3382730782032013, "step": 2390 }, { "epoch": 0.94, "learning_rate": 5.0576108644623536e-08, "logits/chosen": 0.13919702172279358, "logits/rejected": 0.19555795192718506, "logps/chosen": -620.8248901367188, "logps/rejected": -609.3353271484375, "loss": 0.035, "rewards/accuracies": 0.625, "rewards/chosen": -0.2929002642631531, "rewards/margins": 0.04485376924276352, "rewards/rejected": -0.3377540707588196, "step": 2400 }, { "epoch": 0.95, "learning_rate": 4.394860583968624e-08, "logits/chosen": 0.1323954313993454, "logits/rejected": 0.18047359585762024, "logps/chosen": -545.4381103515625, "logps/rejected": -573.6661376953125, "loss": 0.0465, "rewards/accuracies": 0.5750000476837158, "rewards/chosen": -0.2781949043273926, "rewards/margins": 0.049311086535453796, "rewards/rejected": -0.3275059759616852, "step": 2410 }, { "epoch": 0.95, "learning_rate": 3.778252808665284e-08, "logits/chosen": 0.13182419538497925, "logits/rejected": 0.13190338015556335, "logps/chosen": -615.33935546875, "logps/rejected": -599.4812622070312, "loss": 0.0248, "rewards/accuracies": 0.6291667222976685, "rewards/chosen": -0.28227499127388, "rewards/margins": 0.04690806567668915, "rewards/rejected": -0.32918307185173035, "step": 2420 }, { "epoch": 0.95, "learning_rate": 3.207903382331262e-08, "logits/chosen": 0.1339016705751419, "logits/rejected": 0.1503659188747406, "logps/chosen": -600.3336181640625, "logps/rejected": -609.2835693359375, "loss": 0.0367, "rewards/accuracies": 0.6208333373069763, "rewards/chosen": -0.29118743538856506, "rewards/margins": 0.05089714378118515, "rewards/rejected": -0.3420846164226532, "step": 2430 }, { "epoch": 0.96, "learning_rate": 2.683919458063705e-08, "logits/chosen": 0.11768970638513565, "logits/rejected": 0.08185064047574997, "logps/chosen": -548.324462890625, "logps/rejected": -555.4493408203125, "loss": 0.0351, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.27761414647102356, "rewards/margins": 0.048674218356609344, "rewards/rejected": -0.3262883424758911, "step": 2440 }, { "epoch": 0.96, "learning_rate": 2.2063994781468256e-08, "logits/chosen": 0.1320245862007141, "logits/rejected": 0.11001825332641602, "logps/chosen": -556.1405029296875, "logps/rejected": -561.6322631835938, "loss": 0.038, "rewards/accuracies": 0.5791666507720947, "rewards/chosen": -0.2681787610054016, "rewards/margins": 0.052210353314876556, "rewards/rejected": -0.3203890919685364, "step": 2450 }, { "epoch": 0.97, "learning_rate": 1.7754331555573656e-08, "logits/chosen": 0.12108851969242096, "logits/rejected": 0.1010405421257019, "logps/chosen": -575.8355712890625, "logps/rejected": -592.566162109375, "loss": 0.0462, "rewards/accuracies": 0.595833420753479, "rewards/chosen": -0.2866538465023041, "rewards/margins": 0.038824331015348434, "rewards/rejected": -0.325478196144104, "step": 2460 }, { "epoch": 0.97, "learning_rate": 1.3911014571098835e-08, "logits/chosen": 0.12808892130851746, "logits/rejected": 0.16563333570957184, "logps/chosen": -607.0869140625, "logps/rejected": -590.3297119140625, "loss": 0.0351, "rewards/accuracies": 0.6166666746139526, "rewards/chosen": -0.29146137833595276, "rewards/margins": 0.04118754714727402, "rewards/rejected": -0.332648903131485, "step": 2470 }, { "epoch": 0.97, "learning_rate": 1.0534765882453113e-08, "logits/chosen": 0.13690322637557983, "logits/rejected": 0.13680371642112732, "logps/chosen": -574.5492553710938, "logps/rejected": -587.0123291015625, "loss": 0.0335, "rewards/accuracies": 0.6333333849906921, "rewards/chosen": -0.27521491050720215, "rewards/margins": 0.052486516535282135, "rewards/rejected": -0.3277014195919037, "step": 2480 }, { "epoch": 0.98, "learning_rate": 7.626219794655553e-09, "logits/chosen": 0.10713358223438263, "logits/rejected": 0.11065838485956192, "logps/chosen": -582.7445068359375, "logps/rejected": -612.7381591796875, "loss": 0.0351, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": -0.2857518494129181, "rewards/margins": 0.0499836802482605, "rewards/rejected": -0.3357354998588562, "step": 2490 }, { "epoch": 0.98, "learning_rate": 5.185922744166128e-09, "logits/chosen": 0.12372003495693207, "logits/rejected": 0.19938071072101593, "logps/chosen": -613.9265747070312, "logps/rejected": -604.8402099609375, "loss": 0.0258, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -0.2945869565010071, "rewards/margins": 0.04543297737836838, "rewards/rejected": -0.34001994132995605, "step": 2500 }, { "epoch": 0.99, "learning_rate": 3.2143331962256053e-09, "logits/chosen": 0.055711567401885986, "logits/rejected": 0.17630581557750702, "logps/chosen": -589.8251342773438, "logps/rejected": -600.02880859375, "loss": 0.0383, "rewards/accuracies": 0.5708333253860474, "rewards/chosen": -0.2909054160118103, "rewards/margins": 0.03341306746006012, "rewards/rejected": -0.32431846857070923, "step": 2510 }, { "epoch": 0.99, "learning_rate": 1.711821558721405e-09, "logits/chosen": 0.11054827272891998, "logits/rejected": 0.1433221995830536, "logps/chosen": -546.5811767578125, "logps/rejected": -569.87646484375, "loss": 0.0332, "rewards/accuracies": 0.6375000476837158, "rewards/chosen": -0.26730093359947205, "rewards/margins": 0.05267611891031265, "rewards/rejected": -0.3199770748615265, "step": 2520 }, { "epoch": 0.99, "learning_rate": 6.786701125999218e-10, "logits/chosen": 0.12685494124889374, "logits/rejected": 0.12306642532348633, "logps/chosen": -550.8919677734375, "logps/rejected": -557.5734252929688, "loss": 0.0368, "rewards/accuracies": 0.5708333253860474, "rewards/chosen": -0.27823224663734436, "rewards/margins": 0.04585646465420723, "rewards/rejected": -0.3240886926651001, "step": 2530 }, { "epoch": 1.0, "learning_rate": 1.1507295883145253e-10, "logits/chosen": 0.05342329666018486, "logits/rejected": 0.14736703038215637, "logps/chosen": -567.9596557617188, "logps/rejected": -585.9571533203125, "loss": 0.0264, "rewards/accuracies": 0.6208333373069763, "rewards/chosen": -0.27421218156814575, "rewards/margins": 0.046621158719062805, "rewards/rejected": -0.320833295583725, "step": 2540 }, { "epoch": 1.0, "step": 2547, "total_flos": 0.0, "train_loss": 0.04083743233644686, "train_runtime": 25203.2602, "train_samples_per_second": 2.426, "train_steps_per_second": 0.101 } ], "logging_steps": 10, "max_steps": 2547, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }