{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9981298423724285, "eval_steps": 400, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021373230029388193, "grad_norm": 14.616270006854805, "learning_rate": 2.127659574468085e-08, "logits/chosen": -1.1381689310073853, "logits/rejected": -0.9913416504859924, "logps/chosen": -0.2839311361312866, "logps/rejected": -0.29555341601371765, "loss": 1.608, "rewards/accuracies": 0.625, "rewards/chosen": -0.7098277807235718, "rewards/margins": 0.029055725783109665, "rewards/rejected": -0.7388835549354553, "step": 1 }, { "epoch": 0.010686615014694095, "grad_norm": 15.798664787929345, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -0.989432692527771, "logits/rejected": -0.9188639521598816, "logps/chosen": -0.26978519558906555, "logps/rejected": -0.26858454942703247, "loss": 1.6205, "rewards/accuracies": 0.546875, "rewards/chosen": -0.6744629740715027, "rewards/margins": -0.0030015837401151657, "rewards/rejected": -0.6714614033699036, "step": 5 }, { "epoch": 0.02137323002938819, "grad_norm": 12.624563810334125, "learning_rate": 2.127659574468085e-07, "logits/chosen": -0.9885454177856445, "logits/rejected": -0.9444629549980164, "logps/chosen": -0.27237212657928467, "logps/rejected": -0.2738865911960602, "loss": 1.6069, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.6809303164482117, "rewards/margins": 0.0037860602606087923, "rewards/rejected": -0.6847164630889893, "step": 10 }, { "epoch": 0.03205984504408229, "grad_norm": 8.372527145409705, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -0.9621208906173706, "logits/rejected": -0.9169891476631165, "logps/chosen": -0.2949882745742798, "logps/rejected": -0.2834514081478119, "loss": 1.6049, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.7374706864356995, "rewards/margins": -0.028842147439718246, "rewards/rejected": -0.7086285352706909, "step": 15 }, { "epoch": 0.04274646005877638, "grad_norm": 11.776984926293357, "learning_rate": 4.25531914893617e-07, "logits/chosen": -0.9825040102005005, "logits/rejected": -0.983070969581604, "logps/chosen": -0.26186102628707886, "logps/rejected": -0.26891934871673584, "loss": 1.6132, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.65465247631073, "rewards/margins": 0.01764589548110962, "rewards/rejected": -0.6722984313964844, "step": 20 }, { "epoch": 0.053433075073470476, "grad_norm": 11.594406861763694, "learning_rate": 5.319148936170212e-07, "logits/chosen": -1.0304547548294067, "logits/rejected": -1.0080630779266357, "logps/chosen": -0.28237098455429077, "logps/rejected": -0.28737810254096985, "loss": 1.6046, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.7059274911880493, "rewards/margins": 0.012517772614955902, "rewards/rejected": -0.7184451818466187, "step": 25 }, { "epoch": 0.06411969008816458, "grad_norm": 13.181478771123727, "learning_rate": 6.382978723404255e-07, "logits/chosen": -1.0607410669326782, "logits/rejected": -0.9752001762390137, "logps/chosen": -0.2747865319252014, "logps/rejected": -0.27676859498023987, "loss": 1.6169, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.6869663596153259, "rewards/margins": 0.004955160431563854, "rewards/rejected": -0.6919214725494385, "step": 30 }, { "epoch": 0.07480630510285867, "grad_norm": 20.535555442364025, "learning_rate": 7.446808510638297e-07, "logits/chosen": -0.9909561276435852, "logits/rejected": -0.9133442044258118, "logps/chosen": -0.2775736451148987, "logps/rejected": -0.3019401431083679, "loss": 1.6019, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6939341425895691, "rewards/margins": 0.06091625243425369, "rewards/rejected": -0.754850447177887, "step": 35 }, { "epoch": 0.08549292011755276, "grad_norm": 24.246139127435896, "learning_rate": 8.51063829787234e-07, "logits/chosen": -0.9217761158943176, "logits/rejected": -0.9104591608047485, "logps/chosen": -0.2786272466182709, "logps/rejected": -0.29105645418167114, "loss": 1.617, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.6965680718421936, "rewards/margins": 0.031073052436113358, "rewards/rejected": -0.7276411056518555, "step": 40 }, { "epoch": 0.09617953513224686, "grad_norm": 10.340400390143392, "learning_rate": 9.574468085106384e-07, "logits/chosen": -0.918626606464386, "logits/rejected": -0.837913990020752, "logps/chosen": -0.33493560552597046, "logps/rejected": -0.33847588300704956, "loss": 1.6012, "rewards/accuracies": 0.5, "rewards/chosen": -0.8373388051986694, "rewards/margins": 0.00885077752172947, "rewards/rejected": -0.8461896777153015, "step": 45 }, { "epoch": 0.10686615014694095, "grad_norm": 9.374774032732045, "learning_rate": 9.998741174712533e-07, "logits/chosen": -0.9050552248954773, "logits/rejected": -0.9147823452949524, "logps/chosen": -0.29951199889183044, "logps/rejected": -0.33792784810066223, "loss": 1.5734, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7487800717353821, "rewards/margins": 0.09603960067033768, "rewards/rejected": -0.8448196649551392, "step": 50 }, { "epoch": 0.11755276516163506, "grad_norm": 12.319781154359692, "learning_rate": 9.991050648838675e-07, "logits/chosen": -0.917604923248291, "logits/rejected": -0.859523594379425, "logps/chosen": -0.2723899185657501, "logps/rejected": -0.3260456621646881, "loss": 1.5698, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6809747815132141, "rewards/margins": 0.13413934409618378, "rewards/rejected": -0.8151141405105591, "step": 55 }, { "epoch": 0.12823938017632916, "grad_norm": 10.399641038869387, "learning_rate": 9.97637968732563e-07, "logits/chosen": -0.9515789151191711, "logits/rejected": -0.9302376508712769, "logps/chosen": -0.28720152378082275, "logps/rejected": -0.3024197816848755, "loss": 1.5846, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7180038690567017, "rewards/margins": 0.03804563358426094, "rewards/rejected": -0.7560494542121887, "step": 60 }, { "epoch": 0.13892599519102325, "grad_norm": 11.022271869662724, "learning_rate": 9.954748808839674e-07, "logits/chosen": -0.943594753742218, "logits/rejected": -0.9900406002998352, "logps/chosen": -0.28493180871009827, "logps/rejected": -0.28960293531417847, "loss": 1.584, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.7123295664787292, "rewards/margins": 0.01167784072458744, "rewards/rejected": -0.7240074276924133, "step": 65 }, { "epoch": 0.14961261020571734, "grad_norm": 16.92128796829008, "learning_rate": 9.926188266120295e-07, "logits/chosen": -0.9900282025337219, "logits/rejected": -0.9590619802474976, "logps/chosen": -0.347956120967865, "logps/rejected": -0.3921958804130554, "loss": 1.5789, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.8698902130126953, "rewards/margins": 0.11059943586587906, "rewards/rejected": -0.9804896116256714, "step": 70 }, { "epoch": 0.16029922522041143, "grad_norm": 16.614347339411598, "learning_rate": 9.890738003669027e-07, "logits/chosen": -0.9594413042068481, "logits/rejected": -0.9471073150634766, "logps/chosen": -0.3611677289009094, "logps/rejected": -0.4440728724002838, "loss": 1.5386, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9029192924499512, "rewards/margins": 0.20726287364959717, "rewards/rejected": -1.1101821660995483, "step": 75 }, { "epoch": 0.17098584023510552, "grad_norm": 11.76758635779136, "learning_rate": 9.848447601883433e-07, "logits/chosen": -0.9906526803970337, "logits/rejected": -0.9735655784606934, "logps/chosen": -0.32699793577194214, "logps/rejected": -0.40725231170654297, "loss": 1.5433, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.8174948692321777, "rewards/margins": 0.20063595473766327, "rewards/rejected": -1.018130898475647, "step": 80 }, { "epoch": 0.18167245524979964, "grad_norm": 10.051730020561614, "learning_rate": 9.799376207714444e-07, "logits/chosen": -0.9377920031547546, "logits/rejected": -0.889153003692627, "logps/chosen": -0.35584911704063416, "logps/rejected": -0.403181254863739, "loss": 1.5549, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.8896228671073914, "rewards/margins": 0.11833026260137558, "rewards/rejected": -1.0079530477523804, "step": 85 }, { "epoch": 0.19235907026449373, "grad_norm": 12.299894230325533, "learning_rate": 9.743592451943998e-07, "logits/chosen": -0.9021016955375671, "logits/rejected": -0.8429332971572876, "logps/chosen": -0.3575811982154846, "logps/rejected": -0.5039599537849426, "loss": 1.5625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8939528465270996, "rewards/margins": 0.36594703793525696, "rewards/rejected": -1.2598999738693237, "step": 90 }, { "epoch": 0.20304568527918782, "grad_norm": 14.145971876119047, "learning_rate": 9.681174353198686e-07, "logits/chosen": -1.002209186553955, "logits/rejected": -1.0227996110916138, "logps/chosen": -0.41834840178489685, "logps/rejected": -0.47105270624160767, "loss": 1.5299, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.0458710193634033, "rewards/margins": 0.13176079094409943, "rewards/rejected": -1.1776319742202759, "step": 95 }, { "epoch": 0.2137323002938819, "grad_norm": 16.666230206302217, "learning_rate": 9.612209208833646e-07, "logits/chosen": -1.0650508403778076, "logits/rejected": -1.0104029178619385, "logps/chosen": -0.4353299140930176, "logps/rejected": -0.4307008385658264, "loss": 1.5889, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.088324785232544, "rewards/margins": -0.011572673916816711, "rewards/rejected": -1.076751947402954, "step": 100 }, { "epoch": 0.224418915308576, "grad_norm": 17.277037383709768, "learning_rate": 9.536793472839324e-07, "logits/chosen": -0.9813326597213745, "logits/rejected": -0.9346880912780762, "logps/chosen": -0.3515966534614563, "logps/rejected": -0.46131715178489685, "loss": 1.5543, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8789916038513184, "rewards/margins": 0.2743012309074402, "rewards/rejected": -1.1532928943634033, "step": 105 }, { "epoch": 0.2351055303232701, "grad_norm": 12.743189576436166, "learning_rate": 9.455032620941839e-07, "logits/chosen": -0.8806193470954895, "logits/rejected": -0.8356341123580933, "logps/chosen": -0.39990124106407166, "logps/rejected": -0.42553144693374634, "loss": 1.5602, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.9997529983520508, "rewards/margins": 0.06407558172941208, "rewards/rejected": -1.063828706741333, "step": 110 }, { "epoch": 0.2457921453379642, "grad_norm": 13.62783373071176, "learning_rate": 9.367041003085648e-07, "logits/chosen": -0.9096847772598267, "logits/rejected": -0.9085425138473511, "logps/chosen": -0.40596961975097656, "logps/rejected": -0.4234519898891449, "loss": 1.5452, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.0149240493774414, "rewards/margins": 0.043705932796001434, "rewards/rejected": -1.0586299896240234, "step": 115 }, { "epoch": 0.2564787603526583, "grad_norm": 12.129891786309633, "learning_rate": 9.272941683504808e-07, "logits/chosen": -0.9741169810295105, "logits/rejected": -0.9402868151664734, "logps/chosen": -0.3675619959831238, "logps/rejected": -0.4012192189693451, "loss": 1.5295, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.9189050793647766, "rewards/margins": 0.08414317667484283, "rewards/rejected": -1.0030481815338135, "step": 120 }, { "epoch": 0.2671653753673524, "grad_norm": 12.760172929151745, "learning_rate": 9.172866268606513e-07, "logits/chosen": -1.014088749885559, "logits/rejected": -1.007265567779541, "logps/chosen": -0.47613659501075745, "logps/rejected": -0.5860797166824341, "loss": 1.5674, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1903413534164429, "rewards/margins": 0.2748578190803528, "rewards/rejected": -1.4651992321014404, "step": 125 }, { "epoch": 0.2778519903820465, "grad_norm": 14.619403511233878, "learning_rate": 9.066954722907638e-07, "logits/chosen": -1.0016984939575195, "logits/rejected": -1.0274698734283447, "logps/chosen": -0.4267544746398926, "logps/rejected": -0.5695111155509949, "loss": 1.5349, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.0668861865997314, "rewards/margins": 0.35689178109169006, "rewards/rejected": -1.4237778186798096, "step": 130 }, { "epoch": 0.2885386053967406, "grad_norm": 10.619279201046492, "learning_rate": 8.955355173281707e-07, "logits/chosen": -0.9727472066879272, "logits/rejected": -0.9219390749931335, "logps/chosen": -0.39321380853652954, "logps/rejected": -0.4508630633354187, "loss": 1.4978, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.9830344915390015, "rewards/margins": 0.1441231220960617, "rewards/rejected": -1.1271576881408691, "step": 135 }, { "epoch": 0.2992252204114347, "grad_norm": 15.834369827499174, "learning_rate": 8.838223701790055e-07, "logits/chosen": -0.967811107635498, "logits/rejected": -0.9583989977836609, "logps/chosen": -0.43714672327041626, "logps/rejected": -0.5161387920379639, "loss": 1.5218, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0928667783737183, "rewards/margins": 0.19748012721538544, "rewards/rejected": -1.2903468608856201, "step": 140 }, { "epoch": 0.30991183542612877, "grad_norm": 12.847642374810391, "learning_rate": 8.71572412738697e-07, "logits/chosen": -0.9621770977973938, "logits/rejected": -0.9567694664001465, "logps/chosen": -0.43871012330055237, "logps/rejected": -0.4793704152107239, "loss": 1.5379, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.0967752933502197, "rewards/margins": 0.10165063291788101, "rewards/rejected": -1.1984260082244873, "step": 145 }, { "epoch": 0.32059845044082286, "grad_norm": 11.935024879482611, "learning_rate": 8.588027776804058e-07, "logits/chosen": -0.8888929486274719, "logits/rejected": -0.8930709958076477, "logps/chosen": -0.5125688910484314, "logps/rejected": -0.722069263458252, "loss": 1.5225, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2814221382141113, "rewards/margins": 0.5237509608268738, "rewards/rejected": -1.8051731586456299, "step": 150 }, { "epoch": 0.33128506545551695, "grad_norm": 16.764961972010557, "learning_rate": 8.455313244934324e-07, "logits/chosen": -0.9782350659370422, "logits/rejected": -0.9396332502365112, "logps/chosen": -0.5236457586288452, "logps/rejected": -0.7281317710876465, "loss": 1.4981, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.309114694595337, "rewards/margins": 0.5112148523330688, "rewards/rejected": -1.8203294277191162, "step": 155 }, { "epoch": 0.34197168047021104, "grad_norm": 11.439957416651609, "learning_rate": 8.317766145051057e-07, "logits/chosen": -0.9513300061225891, "logits/rejected": -0.9507732391357422, "logps/chosen": -0.4817509651184082, "logps/rejected": -0.6095650792121887, "loss": 1.5319, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2043774127960205, "rewards/margins": 0.3195350468158722, "rewards/rejected": -1.5239124298095703, "step": 160 }, { "epoch": 0.3526582954849052, "grad_norm": 15.127711801028411, "learning_rate": 8.175578849210894e-07, "logits/chosen": -1.0031640529632568, "logits/rejected": -0.9406889081001282, "logps/chosen": -0.5859075784683228, "logps/rejected": -0.6615421175956726, "loss": 1.5097, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4647690057754517, "rewards/margins": 0.18908634781837463, "rewards/rejected": -1.6538550853729248, "step": 165 }, { "epoch": 0.36334491049959927, "grad_norm": 19.457497723556852, "learning_rate": 8.028950219204099e-07, "logits/chosen": -0.9729808568954468, "logits/rejected": -0.9703952670097351, "logps/chosen": -0.5817127823829651, "logps/rejected": -0.7540786266326904, "loss": 1.52, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4542819261550903, "rewards/margins": 0.4309147298336029, "rewards/rejected": -1.8851966857910156, "step": 170 }, { "epoch": 0.37403152551429336, "grad_norm": 13.673431328128673, "learning_rate": 7.878085328428368e-07, "logits/chosen": -0.9736520051956177, "logits/rejected": -0.9835022687911987, "logps/chosen": -0.6428834795951843, "logps/rejected": -0.8455599546432495, "loss": 1.4591, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6072088479995728, "rewards/margins": 0.5066913366317749, "rewards/rejected": -2.1139001846313477, "step": 175 }, { "epoch": 0.38471814052898745, "grad_norm": 20.186619677768963, "learning_rate": 7.723195175075135e-07, "logits/chosen": -0.9989040493965149, "logits/rejected": -0.9996326565742493, "logps/chosen": -0.6329335570335388, "logps/rejected": -0.7996684312820435, "loss": 1.5042, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5823338031768799, "rewards/margins": 0.41683727502822876, "rewards/rejected": -1.999171257019043, "step": 180 }, { "epoch": 0.39540475554368154, "grad_norm": 20.89728549129117, "learning_rate": 7.564496387029531e-07, "logits/chosen": -1.0245858430862427, "logits/rejected": -0.9974483251571655, "logps/chosen": -0.5426946878433228, "logps/rejected": -0.765714168548584, "loss": 1.5357, "rewards/accuracies": 0.625, "rewards/chosen": -1.3567368984222412, "rewards/margins": 0.5575486421585083, "rewards/rejected": -1.914285659790039, "step": 185 }, { "epoch": 0.40609137055837563, "grad_norm": 14.284349268241145, "learning_rate": 7.402210918896689e-07, "logits/chosen": -1.0203882455825806, "logits/rejected": -1.002201795578003, "logps/chosen": -0.5420663356781006, "logps/rejected": -0.8374710083007812, "loss": 1.4458, "rewards/accuracies": 0.6875, "rewards/chosen": -1.355165719985962, "rewards/margins": 0.7385115623474121, "rewards/rejected": -2.093677282333374, "step": 190 }, { "epoch": 0.4167779855730697, "grad_norm": 21.006500802721117, "learning_rate": 7.236565741578162e-07, "logits/chosen": -0.9360872507095337, "logits/rejected": -0.9267364740371704, "logps/chosen": -0.6582534909248352, "logps/rejected": -0.9019840359687805, "loss": 1.496, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.6456336975097656, "rewards/margins": 0.6093264818191528, "rewards/rejected": -2.254960298538208, "step": 195 }, { "epoch": 0.4274646005877638, "grad_norm": 17.34001688115439, "learning_rate": 7.067792524832603e-07, "logits/chosen": -0.9733101725578308, "logits/rejected": -0.9737011194229126, "logps/chosen": -0.5885142087936401, "logps/rejected": -0.759198784828186, "loss": 1.4947, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4712855815887451, "rewards/margins": 0.42671123147010803, "rewards/rejected": -1.8979966640472412, "step": 200 }, { "epoch": 0.4381512156024579, "grad_norm": 18.93087179194272, "learning_rate": 6.896127313264642e-07, "logits/chosen": -1.0645233392715454, "logits/rejected": -1.0247756242752075, "logps/chosen": -0.7058154940605164, "logps/rejected": -0.8320499658584595, "loss": 1.481, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.7645390033721924, "rewards/margins": 0.3155860900878906, "rewards/rejected": -2.080124855041504, "step": 205 }, { "epoch": 0.448837830617152, "grad_norm": 20.51304681116505, "learning_rate": 6.721810196195174e-07, "logits/chosen": -1.0714137554168701, "logits/rejected": -0.9989116787910461, "logps/chosen": -0.6715008020401001, "logps/rejected": -0.8458214998245239, "loss": 1.4931, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6787519454956055, "rewards/margins": 0.43580159544944763, "rewards/rejected": -2.114553451538086, "step": 210 }, { "epoch": 0.45952444563184613, "grad_norm": 16.46301637400639, "learning_rate": 6.545084971874736e-07, "logits/chosen": -1.0320379734039307, "logits/rejected": -0.9812155961990356, "logps/chosen": -0.7699872255325317, "logps/rejected": -0.9591902494430542, "loss": 1.4357, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.9249680042266846, "rewards/margins": 0.47300752997398376, "rewards/rejected": -2.397975444793701, "step": 215 }, { "epoch": 0.4702110606465402, "grad_norm": 21.088635750930834, "learning_rate": 6.3661988065096e-07, "logits/chosen": -1.0327280759811401, "logits/rejected": -0.9874359965324402, "logps/chosen": -0.8896724581718445, "logps/rejected": -1.0073192119598389, "loss": 1.4954, "rewards/accuracies": 0.625, "rewards/chosen": -2.2241809368133545, "rewards/margins": 0.2941167950630188, "rewards/rejected": -2.5182979106903076, "step": 220 }, { "epoch": 0.4808976756612343, "grad_norm": 19.02142810522417, "learning_rate": 6.185401888577487e-07, "logits/chosen": -0.9864276051521301, "logits/rejected": -0.9660781621932983, "logps/chosen": -0.772359311580658, "logps/rejected": -0.794577956199646, "loss": 1.557, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9308984279632568, "rewards/margins": 0.0555465929210186, "rewards/rejected": -1.9864448308944702, "step": 225 }, { "epoch": 0.4915842906759284, "grad_norm": 17.867436451445265, "learning_rate": 6.002947078916364e-07, "logits/chosen": -1.0150645971298218, "logits/rejected": -0.9232436418533325, "logps/chosen": -0.8819006085395813, "logps/rejected": -1.0791282653808594, "loss": 1.4692, "rewards/accuracies": 0.625, "rewards/chosen": -2.2047512531280518, "rewards/margins": 0.49306946992874146, "rewards/rejected": -2.6978209018707275, "step": 230 }, { "epoch": 0.5022709056906225, "grad_norm": 19.261147315812543, "learning_rate": 5.819089557075688e-07, "logits/chosen": -0.996496319770813, "logits/rejected": -0.9518272280693054, "logps/chosen": -0.8440850377082825, "logps/rejected": -0.9660031199455261, "loss": 1.4942, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.110212564468384, "rewards/margins": 0.30479517579078674, "rewards/rejected": -2.4150078296661377, "step": 235 }, { "epoch": 0.5129575207053166, "grad_norm": 18.32103206829717, "learning_rate": 5.634086464424742e-07, "logits/chosen": -1.0355093479156494, "logits/rejected": -1.006392002105713, "logps/chosen": -0.7564712762832642, "logps/rejected": -0.9403325319290161, "loss": 1.4046, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8911781311035156, "rewards/margins": 0.4596532881259918, "rewards/rejected": -2.3508315086364746, "step": 240 }, { "epoch": 0.5236441357200107, "grad_norm": 13.624682866914808, "learning_rate": 5.448196544517167e-07, "logits/chosen": -1.0392498970031738, "logits/rejected": -1.0080206394195557, "logps/chosen": -0.7070968151092529, "logps/rejected": -0.8767108917236328, "loss": 1.4407, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.7677417993545532, "rewards/margins": 0.4240352213382721, "rewards/rejected": -2.191777229309082, "step": 245 }, { "epoch": 0.5343307507347048, "grad_norm": 22.22843267157582, "learning_rate": 5.26167978121472e-07, "logits/chosen": -1.048105001449585, "logits/rejected": -1.028188943862915, "logps/chosen": -0.8559715151786804, "logps/rejected": -1.2052185535430908, "loss": 1.467, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1399290561676025, "rewards/margins": 0.8731171488761902, "rewards/rejected": -3.0130460262298584, "step": 250 }, { "epoch": 0.5450173657493989, "grad_norm": 21.7502246417763, "learning_rate": 5.074797035076318e-07, "logits/chosen": -1.0093282461166382, "logits/rejected": -1.0085898637771606, "logps/chosen": -0.8086752891540527, "logps/rejected": -0.9931262731552124, "loss": 1.4101, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.021688222885132, "rewards/margins": 0.4611276090145111, "rewards/rejected": -2.4828155040740967, "step": 255 }, { "epoch": 0.555703980764093, "grad_norm": 17.422008542886676, "learning_rate": 4.887809678520975e-07, "logits/chosen": -0.9935086369514465, "logits/rejected": -0.9804226160049438, "logps/chosen": -0.7468116879463196, "logps/rejected": -1.0926599502563477, "loss": 1.4384, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8670291900634766, "rewards/margins": 0.8646209836006165, "rewards/rejected": -2.731649875640869, "step": 260 }, { "epoch": 0.566390595778787, "grad_norm": 21.42359137838811, "learning_rate": 4.700979230274829e-07, "logits/chosen": -1.0343437194824219, "logits/rejected": -1.0285662412643433, "logps/chosen": -0.8779309391975403, "logps/rejected": -0.9997223019599915, "loss": 1.4835, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.19482684135437, "rewards/margins": 0.3044784963130951, "rewards/rejected": -2.4993057250976562, "step": 265 }, { "epoch": 0.5770772107934812, "grad_norm": 27.361650975575174, "learning_rate": 4.514566989613559e-07, "logits/chosen": -1.022482991218567, "logits/rejected": -1.030027151107788, "logps/chosen": -0.8233085870742798, "logps/rejected": -1.0775353908538818, "loss": 1.4746, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.058271646499634, "rewards/margins": 0.6355669498443604, "rewards/rejected": -2.693838596343994, "step": 270 }, { "epoch": 0.5877638258081752, "grad_norm": 19.306011967200188, "learning_rate": 4.328833670911724e-07, "logits/chosen": -0.8774306178092957, "logits/rejected": -0.841380774974823, "logps/chosen": -0.977648138999939, "logps/rejected": -1.1523730754852295, "loss": 1.4355, "rewards/accuracies": 0.5625, "rewards/chosen": -2.444120168685913, "rewards/margins": 0.43681272864341736, "rewards/rejected": -2.8809328079223633, "step": 275 }, { "epoch": 0.5984504408228694, "grad_norm": 18.830991297114817, "learning_rate": 4.144039039010124e-07, "logits/chosen": -1.0766229629516602, "logits/rejected": -1.034156084060669, "logps/chosen": -0.9281458854675293, "logps/rejected": -1.0838878154754639, "loss": 1.4465, "rewards/accuracies": 0.625, "rewards/chosen": -2.3203647136688232, "rewards/margins": 0.38935500383377075, "rewards/rejected": -2.709719657897949, "step": 280 }, { "epoch": 0.6091370558375635, "grad_norm": 20.938307778539542, "learning_rate": 3.960441545911204e-07, "logits/chosen": -0.9466499090194702, "logits/rejected": -0.9020398855209351, "logps/chosen": -1.1035289764404297, "logps/rejected": -1.1428934335708618, "loss": 1.4739, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.758822441101074, "rewards/margins": 0.09841099381446838, "rewards/rejected": -2.8572330474853516, "step": 285 }, { "epoch": 0.6198236708522575, "grad_norm": 23.245653263780497, "learning_rate": 3.778297969310529e-07, "logits/chosen": -0.9969805479049683, "logits/rejected": -0.9768760800361633, "logps/chosen": -0.9135320782661438, "logps/rejected": -1.1653510332107544, "loss": 1.3943, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.283830165863037, "rewards/margins": 0.6295474767684937, "rewards/rejected": -2.9133777618408203, "step": 290 }, { "epoch": 0.6305102858669517, "grad_norm": 18.66753481518412, "learning_rate": 3.5978630534699865e-07, "logits/chosen": -1.0683342218399048, "logits/rejected": -1.0506360530853271, "logps/chosen": -1.0534954071044922, "logps/rejected": -1.2509009838104248, "loss": 1.4446, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.6337387561798096, "rewards/margins": 0.4935137629508972, "rewards/rejected": -3.1272525787353516, "step": 295 }, { "epoch": 0.6411969008816457, "grad_norm": 21.410873330760186, "learning_rate": 3.4193891529348795e-07, "logits/chosen": -0.943785548210144, "logits/rejected": -0.8960458040237427, "logps/chosen": -0.9090589284896851, "logps/rejected": -1.2448195219039917, "loss": 1.4048, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.2726473808288574, "rewards/margins": 0.8394016027450562, "rewards/rejected": -3.112048625946045, "step": 300 }, { "epoch": 0.6518835158963399, "grad_norm": 16.99902973812307, "learning_rate": 3.243125879593286e-07, "logits/chosen": -1.001460075378418, "logits/rejected": -0.9785451889038086, "logps/chosen": -1.0239307880401611, "logps/rejected": -1.286873698234558, "loss": 1.4219, "rewards/accuracies": 0.6875, "rewards/chosen": -2.5598270893096924, "rewards/margins": 0.6573570370674133, "rewards/rejected": -3.217184543609619, "step": 305 }, { "epoch": 0.6625701309110339, "grad_norm": 35.891043777341494, "learning_rate": 3.069319753571269e-07, "logits/chosen": -0.9791940450668335, "logits/rejected": -1.024137258529663, "logps/chosen": -1.0476067066192627, "logps/rejected": -1.3181588649749756, "loss": 1.4695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.619016170501709, "rewards/margins": 0.6763805747032166, "rewards/rejected": -3.2953972816467285, "step": 310 }, { "epoch": 0.673256745925728, "grad_norm": 22.292608497489727, "learning_rate": 2.898213858452173e-07, "logits/chosen": -1.1296494007110596, "logits/rejected": -1.086306095123291, "logps/chosen": -1.0372415781021118, "logps/rejected": -1.1678041219711304, "loss": 1.469, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.593104124069214, "rewards/margins": 0.3264063894748688, "rewards/rejected": -2.9195103645324707, "step": 315 }, { "epoch": 0.6839433609404221, "grad_norm": 24.37127385244042, "learning_rate": 2.730047501302266e-07, "logits/chosen": -1.0386595726013184, "logits/rejected": -1.019863486289978, "logps/chosen": -0.9849531054496765, "logps/rejected": -1.240861177444458, "loss": 1.4399, "rewards/accuracies": 0.625, "rewards/chosen": -2.4623827934265137, "rewards/margins": 0.6397703289985657, "rewards/rejected": -3.1021530628204346, "step": 320 }, { "epoch": 0.6946299759551162, "grad_norm": 28.3616490001571, "learning_rate": 2.5650558779781635e-07, "logits/chosen": -0.9190061688423157, "logits/rejected": -0.9376241564750671, "logps/chosen": -1.0179104804992676, "logps/rejected": -1.1567823886871338, "loss": 1.3944, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.544776439666748, "rewards/margins": 0.3471793532371521, "rewards/rejected": -2.891955614089966, "step": 325 }, { "epoch": 0.7053165909698104, "grad_norm": 23.089017555469844, "learning_rate": 2.403469744184154e-07, "logits/chosen": -1.0184417963027954, "logits/rejected": -0.9639641642570496, "logps/chosen": -1.065198540687561, "logps/rejected": -1.4555224180221558, "loss": 1.4179, "rewards/accuracies": 0.6875, "rewards/chosen": -2.6629960536956787, "rewards/margins": 0.9758095741271973, "rewards/rejected": -3.638806104660034, "step": 330 }, { "epoch": 0.7160032059845044, "grad_norm": 19.01160663909916, "learning_rate": 2.2455150927394878e-07, "logits/chosen": -0.9782741665840149, "logits/rejected": -1.0168932676315308, "logps/chosen": -1.1860311031341553, "logps/rejected": -1.3966830968856812, "loss": 1.4107, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.965078115463257, "rewards/margins": 0.5266298651695251, "rewards/rejected": -3.4917080402374268, "step": 335 }, { "epoch": 0.7266898209991985, "grad_norm": 24.023962985538827, "learning_rate": 2.0914128375069722e-07, "logits/chosen": -0.9762552380561829, "logits/rejected": -0.9652606248855591, "logps/chosen": -1.0335513353347778, "logps/rejected": -1.3794705867767334, "loss": 1.4363, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.5838780403137207, "rewards/margins": 0.8647986650466919, "rewards/rejected": -3.448676586151123, "step": 340 }, { "epoch": 0.7373764360138926, "grad_norm": 24.338376358621, "learning_rate": 1.9413785044249676e-07, "logits/chosen": -0.9369735717773438, "logits/rejected": -0.9039338827133179, "logps/chosen": -1.2081493139266968, "logps/rejected": -1.3959646224975586, "loss": 1.3412, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.020373821258545, "rewards/margins": 0.4695381224155426, "rewards/rejected": -3.4899115562438965, "step": 345 }, { "epoch": 0.7480630510285867, "grad_norm": 25.7573169489427, "learning_rate": 1.7956219300748792e-07, "logits/chosen": -1.0008254051208496, "logits/rejected": -1.0084810256958008, "logps/chosen": -1.048879861831665, "logps/rejected": -1.3672925233840942, "loss": 1.3953, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.622199773788452, "rewards/margins": 0.7960314750671387, "rewards/rejected": -3.418231248855591, "step": 350 }, { "epoch": 0.7587496660432808, "grad_norm": 26.96889041092714, "learning_rate": 1.6543469682057104e-07, "logits/chosen": -1.0293567180633545, "logits/rejected": -0.9677215814590454, "logps/chosen": -1.0719913244247437, "logps/rejected": -1.2735936641693115, "loss": 1.3954, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.679978370666504, "rewards/margins": 0.5040060877799988, "rewards/rejected": -3.1839847564697266, "step": 355 }, { "epoch": 0.7694362810579749, "grad_norm": 21.661999756036945, "learning_rate": 1.5177512046261666e-07, "logits/chosen": -0.9887057542800903, "logits/rejected": -1.0177589654922485, "logps/chosen": -1.291603922843933, "logps/rejected": -1.665748953819275, "loss": 1.3804, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -3.2290101051330566, "rewards/margins": 0.9353626370429993, "rewards/rejected": -4.164372444152832, "step": 360 }, { "epoch": 0.7801228960726689, "grad_norm": 21.098443523944745, "learning_rate": 1.3860256808630427e-07, "logits/chosen": -0.9724413752555847, "logits/rejected": -0.9351035952568054, "logps/chosen": -1.203754186630249, "logps/rejected": -1.3952935934066772, "loss": 1.3868, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.009385347366333, "rewards/margins": 0.4788486063480377, "rewards/rejected": -3.488234043121338, "step": 365 }, { "epoch": 0.7908095110873631, "grad_norm": 25.021776262301575, "learning_rate": 1.2593546269723647e-07, "logits/chosen": -1.0209473371505737, "logits/rejected": -1.0015369653701782, "logps/chosen": -1.2007997035980225, "logps/rejected": -1.3749370574951172, "loss": 1.3739, "rewards/accuracies": 0.625, "rewards/chosen": -3.0019993782043457, "rewards/margins": 0.43534326553344727, "rewards/rejected": -3.437342405319214, "step": 370 }, { "epoch": 0.8014961261020572, "grad_norm": 25.159927366829866, "learning_rate": 1.1379152038770029e-07, "logits/chosen": -1.0015761852264404, "logits/rejected": -1.0289709568023682, "logps/chosen": -1.0982341766357422, "logps/rejected": -1.3828232288360596, "loss": 1.4606, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.7455852031707764, "rewards/margins": 0.7114725708961487, "rewards/rejected": -3.4570579528808594, "step": 375 }, { "epoch": 0.8121827411167513, "grad_norm": 20.84984615780728, "learning_rate": 1.0218772555910954e-07, "logits/chosen": -1.087043046951294, "logits/rejected": -1.0346585512161255, "logps/chosen": -1.2293280363082886, "logps/rejected": -1.4728463888168335, "loss": 1.3777, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -3.073319911956787, "rewards/margins": 0.6087957620620728, "rewards/rejected": -3.6821160316467285, "step": 380 }, { "epoch": 0.8228693561314454, "grad_norm": 22.784469290812634, "learning_rate": 9.114030716778432e-08, "logits/chosen": -1.0404977798461914, "logits/rejected": -1.0313512086868286, "logps/chosen": -1.2078773975372314, "logps/rejected": -1.5215582847595215, "loss": 1.3372, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.019693613052368, "rewards/margins": 0.7842024564743042, "rewards/rejected": -3.803895950317383, "step": 385 }, { "epoch": 0.8335559711461394, "grad_norm": 38.2013964652558, "learning_rate": 8.066471602728803e-08, "logits/chosen": -1.0984976291656494, "logits/rejected": -1.0727328062057495, "logps/chosen": -1.2440178394317627, "logps/rejected": -1.535946011543274, "loss": 1.3633, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.110044479370117, "rewards/margins": 0.7298205494880676, "rewards/rejected": -3.839865207672119, "step": 390 }, { "epoch": 0.8442425861608336, "grad_norm": 21.29430884924605, "learning_rate": 7.077560319906694e-08, "logits/chosen": -0.9021504521369934, "logits/rejected": -0.817115306854248, "logps/chosen": -1.1583576202392578, "logps/rejected": -1.3431838750839233, "loss": 1.396, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.8958938121795654, "rewards/margins": 0.46206584572792053, "rewards/rejected": -3.357959747314453, "step": 395 }, { "epoch": 0.8549292011755276, "grad_norm": 18.542727738232298, "learning_rate": 6.148679950161672e-08, "logits/chosen": -0.9785951375961304, "logits/rejected": -0.9371121525764465, "logps/chosen": -1.3020581007003784, "logps/rejected": -1.5772645473480225, "loss": 1.3975, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.255145311355591, "rewards/margins": 0.6880159974098206, "rewards/rejected": -3.9431610107421875, "step": 400 }, { "epoch": 0.8549292011755276, "eval_logits/chosen": -1.1872848272323608, "eval_logits/rejected": -1.1544642448425293, "eval_logps/chosen": -1.1779358386993408, "eval_logps/rejected": -1.4415003061294556, "eval_loss": 1.375516653060913, "eval_rewards/accuracies": 0.6612903475761414, "eval_rewards/chosen": -2.9448394775390625, "eval_rewards/margins": 0.6589111685752869, "eval_rewards/rejected": -3.603750705718994, "eval_runtime": 72.7978, "eval_samples_per_second": 26.938, "eval_steps_per_second": 0.852, "step": 400 }, { "epoch": 0.8656158161902218, "grad_norm": 23.254725123506937, "learning_rate": 5.2811296166831666e-08, "logits/chosen": -0.9737190008163452, "logits/rejected": -0.8999165296554565, "logps/chosen": -1.2110763788223267, "logps/rejected": -1.4770066738128662, "loss": 1.4197, "rewards/accuracies": 0.6875, "rewards/chosen": -3.027691125869751, "rewards/margins": 0.6648265719413757, "rewards/rejected": -3.6925175189971924, "step": 405 }, { "epoch": 0.8763024312049158, "grad_norm": 31.565832295867562, "learning_rate": 4.4761226670592066e-08, "logits/chosen": -0.9489310383796692, "logits/rejected": -0.9287575483322144, "logps/chosen": -1.2163779735565186, "logps/rejected": -1.4926642179489136, "loss": 1.3848, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.040944814682007, "rewards/margins": 0.6907154321670532, "rewards/rejected": -3.7316603660583496, "step": 410 }, { "epoch": 0.88698904621961, "grad_norm": 22.344503337533215, "learning_rate": 3.734784976300165e-08, "logits/chosen": -0.92424076795578, "logits/rejected": -0.913791298866272, "logps/chosen": -1.1991280317306519, "logps/rejected": -1.3796842098236084, "loss": 1.3718, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.9978199005126953, "rewards/margins": 0.45139074325561523, "rewards/rejected": -3.4492106437683105, "step": 415 }, { "epoch": 0.897675661234304, "grad_norm": 25.79666616123638, "learning_rate": 3.058153372200695e-08, "logits/chosen": -0.9958304166793823, "logits/rejected": -0.9945958256721497, "logps/chosen": -1.3022658824920654, "logps/rejected": -1.4703620672225952, "loss": 1.4021, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -3.255664348602295, "rewards/margins": 0.4202408194541931, "rewards/rejected": -3.675905704498291, "step": 420 }, { "epoch": 0.9083622762489981, "grad_norm": 24.252773163543072, "learning_rate": 2.4471741852423233e-08, "logits/chosen": -0.9615448713302612, "logits/rejected": -0.9239951968193054, "logps/chosen": -1.2393920421600342, "logps/rejected": -1.5796483755111694, "loss": 1.3444, "rewards/accuracies": 0.625, "rewards/chosen": -3.098480224609375, "rewards/margins": 0.8506406545639038, "rewards/rejected": -3.9491209983825684, "step": 425 }, { "epoch": 0.9190488912636923, "grad_norm": 25.840588676616843, "learning_rate": 1.9027019250647036e-08, "logits/chosen": -0.9318382143974304, "logits/rejected": -0.895746111869812, "logps/chosen": -1.19111168384552, "logps/rejected": -1.4055861234664917, "loss": 1.3383, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.9777793884277344, "rewards/margins": 0.536186158657074, "rewards/rejected": -3.513965606689453, "step": 430 }, { "epoch": 0.9297355062783863, "grad_norm": 24.38677858349731, "learning_rate": 1.4254980853566246e-08, "logits/chosen": -0.9664777517318726, "logits/rejected": -0.9452959299087524, "logps/chosen": -1.1789872646331787, "logps/rejected": -1.4990911483764648, "loss": 1.3959, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.9474682807922363, "rewards/margins": 0.8002597093582153, "rewards/rejected": -3.747727870941162, "step": 435 }, { "epoch": 0.9404221212930804, "grad_norm": 20.846346077184034, "learning_rate": 1.016230078838226e-08, "logits/chosen": -0.9898012280464172, "logits/rejected": -0.9755558967590332, "logps/chosen": -1.205000638961792, "logps/rejected": -1.4142088890075684, "loss": 1.3646, "rewards/accuracies": 0.625, "rewards/chosen": -3.0125012397766113, "rewards/margins": 0.5230205059051514, "rewards/rejected": -3.535521984100342, "step": 440 }, { "epoch": 0.9511087363077745, "grad_norm": 25.087392613196315, "learning_rate": 6.754703038239329e-09, "logits/chosen": -0.9521238207817078, "logits/rejected": -0.9080606698989868, "logps/chosen": -1.1550867557525635, "logps/rejected": -1.3055427074432373, "loss": 1.3825, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.887716770172119, "rewards/margins": 0.3761400282382965, "rewards/rejected": -3.263856887817383, "step": 445 }, { "epoch": 0.9617953513224686, "grad_norm": 24.725896653899223, "learning_rate": 4.036953436716895e-09, "logits/chosen": -0.9332659840583801, "logits/rejected": -0.9464728236198425, "logps/chosen": -1.2779486179351807, "logps/rejected": -1.5308974981307983, "loss": 1.3408, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.194871425628662, "rewards/margins": 0.6323727369308472, "rewards/rejected": -3.8272438049316406, "step": 450 }, { "epoch": 0.9724819663371627, "grad_norm": 28.975077371611935, "learning_rate": 2.0128530023804656e-09, "logits/chosen": -0.9732203483581543, "logits/rejected": -0.9457721710205078, "logps/chosen": -1.345735788345337, "logps/rejected": -1.4330954551696777, "loss": 1.4589, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -3.3643393516540527, "rewards/margins": 0.21839912235736847, "rewards/rejected": -3.5827383995056152, "step": 455 }, { "epoch": 0.9831685813518568, "grad_norm": 26.785266092328502, "learning_rate": 6.852326227130833e-10, "logits/chosen": -0.9476199150085449, "logits/rejected": -0.9895550608634949, "logps/chosen": -1.4385395050048828, "logps/rejected": -1.8012183904647827, "loss": 1.3759, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.596348524093628, "rewards/margins": 0.906697154045105, "rewards/rejected": -4.503045558929443, "step": 460 }, { "epoch": 0.9938551963665508, "grad_norm": 25.28595563871724, "learning_rate": 5.594909486328348e-11, "logits/chosen": -1.0385969877243042, "logits/rejected": -0.9505317807197571, "logps/chosen": -1.1505335569381714, "logps/rejected": -1.4205496311187744, "loss": 1.3506, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.8763339519500732, "rewards/margins": 0.6750401258468628, "rewards/rejected": -3.5513739585876465, "step": 465 }, { "epoch": 0.9981298423724285, "step": 467, "total_flos": 0.0, "train_loss": 1.4758259897824273, "train_runtime": 7808.6202, "train_samples_per_second": 7.668, "train_steps_per_second": 0.06 } ], "logging_steps": 5, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }