diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8421 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.9786856127886323, + "eval_steps": 50.0, + "global_step": 560, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "grad_norm": 281.90715152422604, + "learning_rate": 5e-08, + "logits/chosen": -2.6629433631896973, + "logits/rejected": -2.567349433898926, + "logps/chosen": -198.9954833984375, + "logps/rejected": -227.59715270996094, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "grad_norm": 242.115951433827, + "learning_rate": 1e-07, + "logits/chosen": -2.8272287845611572, + "logits/rejected": -2.625117540359497, + "logps/chosen": -261.0748291015625, + "logps/rejected": -277.235107421875, + "loss": 0.5884, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0696694403886795, + "rewards/margins": 0.24324257671833038, + "rewards/rejected": -0.17357313632965088, + "step": 2 + }, + { + "epoch": 0.02, + "grad_norm": 199.0452683748491, + "learning_rate": 1.5e-07, + "logits/chosen": -2.730591297149658, + "logits/rejected": -2.6427087783813477, + "logps/chosen": -190.12083435058594, + "logps/rejected": -260.2308044433594, + "loss": 0.3964, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18776875734329224, + "rewards/margins": 0.705060601234436, + "rewards/rejected": -0.5172918438911438, + "step": 3 + }, + { + "epoch": 0.03, + "grad_norm": 140.17062357724015, + "learning_rate": 2e-07, + "logits/chosen": -2.700812816619873, + "logits/rejected": -2.6588032245635986, + "logps/chosen": -192.72433471679688, + "logps/rejected": -274.5499572753906, + "loss": 0.3091, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25030839443206787, + "rewards/margins": 1.1928011178970337, + "rewards/rejected": -0.9424927830696106, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 87.30006666351441, + "learning_rate": 2.5e-07, + "logits/chosen": -2.6981821060180664, + "logits/rejected": -2.558434009552002, + "logps/chosen": -191.63417053222656, + "logps/rejected": -214.12445068359375, + "loss": 0.1621, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4374893307685852, + "rewards/margins": 2.4500865936279297, + "rewards/rejected": -2.0125973224639893, + "step": 5 + }, + { + "epoch": 0.04, + "grad_norm": 39.988731957951615, + "learning_rate": 3e-07, + "logits/chosen": -2.679320812225342, + "logits/rejected": -2.608996868133545, + "logps/chosen": -178.1643524169922, + "logps/rejected": -258.6800231933594, + "loss": 0.0805, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6403220891952515, + "rewards/margins": 3.48699951171875, + "rewards/rejected": -2.84667706489563, + "step": 6 + }, + { + "epoch": 0.05, + "grad_norm": 33.310634028311206, + "learning_rate": 3.5e-07, + "logits/chosen": -2.6411356925964355, + "logits/rejected": -2.6490371227264404, + "logps/chosen": -154.45066833496094, + "logps/rejected": -323.791748046875, + "loss": 0.0503, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8468711376190186, + "rewards/margins": 5.800564289093018, + "rewards/rejected": -4.953693389892578, + "step": 7 + }, + { + "epoch": 0.06, + "grad_norm": 23.883973264297232, + "learning_rate": 4e-07, + "logits/chosen": -2.6611804962158203, + "logits/rejected": -2.631485939025879, + "logps/chosen": -123.65115356445312, + "logps/rejected": -229.07261657714844, + "loss": 0.0392, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4461992383003235, + "rewards/margins": 4.723011493682861, + "rewards/rejected": -4.276812553405762, + "step": 8 + }, + { + "epoch": 0.06, + "grad_norm": 22.535417111915557, + "learning_rate": 4.5e-07, + "logits/chosen": -2.7018516063690186, + "logits/rejected": -2.576171398162842, + "logps/chosen": -214.6218719482422, + "logps/rejected": -288.8828430175781, + "loss": 0.0344, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4354260563850403, + "rewards/margins": 5.6106157302856445, + "rewards/rejected": -5.175189971923828, + "step": 9 + }, + { + "epoch": 0.07, + "grad_norm": 29.594161907671293, + "learning_rate": 5e-07, + "logits/chosen": -2.6480002403259277, + "logits/rejected": -2.595672369003296, + "logps/chosen": -133.1983184814453, + "logps/rejected": -292.673583984375, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0619034618139267, + "rewards/margins": 8.280220985412598, + "rewards/rejected": -8.218317985534668, + "step": 10 + }, + { + "epoch": 0.08, + "grad_norm": 6.546818349184967, + "learning_rate": 4.999959216621625e-07, + "logits/chosen": -2.6915440559387207, + "logits/rejected": -2.601680278778076, + "logps/chosen": -208.8468475341797, + "logps/rejected": -357.8781433105469, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36291420459747314, + "rewards/margins": 12.313801765441895, + "rewards/rejected": -11.950888633728027, + "step": 11 + }, + { + "epoch": 0.09, + "grad_norm": 2.711195693631395, + "learning_rate": 4.999836867817129e-07, + "logits/chosen": -2.7125449180603027, + "logits/rejected": -2.6277506351470947, + "logps/chosen": -193.25125122070312, + "logps/rejected": -377.0108337402344, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19406113028526306, + "rewards/margins": 13.71890640258789, + "rewards/rejected": -13.5248441696167, + "step": 12 + }, + { + "epoch": 0.09, + "grad_norm": 47.98611653944905, + "learning_rate": 4.999632957578348e-07, + "logits/chosen": -2.713609218597412, + "logits/rejected": -2.5939130783081055, + "logps/chosen": -236.6820068359375, + "logps/rejected": -416.9261474609375, + "loss": 0.0507, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5520939230918884, + "rewards/margins": 14.957647323608398, + "rewards/rejected": -14.40555191040039, + "step": 13 + }, + { + "epoch": 0.1, + "grad_norm": 1.0738890125654583, + "learning_rate": 4.999347492558202e-07, + "logits/chosen": -2.7518250942230225, + "logits/rejected": -2.6807100772857666, + "logps/chosen": -214.81539916992188, + "logps/rejected": -419.8458251953125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7233214974403381, + "rewards/margins": 16.712013244628906, + "rewards/rejected": -15.988694190979004, + "step": 14 + }, + { + "epoch": 0.11, + "grad_norm": 1.505052280526844, + "learning_rate": 4.998980482070472e-07, + "logits/chosen": -2.6702589988708496, + "logits/rejected": -2.654355049133301, + "logps/chosen": -155.551513671875, + "logps/rejected": -408.07562255859375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27805235981941223, + "rewards/margins": 16.221580505371094, + "rewards/rejected": -16.4996337890625, + "step": 15 + }, + { + "epoch": 0.11, + "grad_norm": 1.0543484170906776, + "learning_rate": 4.998531938089504e-07, + "logits/chosen": -2.6782705783843994, + "logits/rejected": -2.6008286476135254, + "logps/chosen": -221.91819763183594, + "logps/rejected": -455.9557800292969, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3678271770477295, + "rewards/margins": 21.218441009521484, + "rewards/rejected": -20.85061264038086, + "step": 16 + }, + { + "epoch": 0.12, + "grad_norm": 0.5260502592201036, + "learning_rate": 4.998001875249803e-07, + "logits/chosen": -2.6398086547851562, + "logits/rejected": -2.5224509239196777, + "logps/chosen": -215.98538208007812, + "logps/rejected": -389.3098449707031, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7489739656448364, + "rewards/margins": 17.479093551635742, + "rewards/rejected": -16.730117797851562, + "step": 17 + }, + { + "epoch": 0.13, + "grad_norm": 66.17444082685097, + "learning_rate": 4.997390310845577e-07, + "logits/chosen": -2.743476629257202, + "logits/rejected": -2.640836715698242, + "logps/chosen": -277.9833984375, + "logps/rejected": -529.0450439453125, + "loss": 0.0225, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1146619319915771, + "rewards/margins": 20.89437484741211, + "rewards/rejected": -22.0090389251709, + "step": 18 + }, + { + "epoch": 0.13, + "grad_norm": 29.930259298401037, + "learning_rate": 4.996697264830153e-07, + "logits/chosen": -2.6333608627319336, + "logits/rejected": -2.5673627853393555, + "logps/chosen": -192.6189422607422, + "logps/rejected": -393.21612548828125, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42793774604797363, + "rewards/margins": 18.234699249267578, + "rewards/rejected": -18.662635803222656, + "step": 19 + }, + { + "epoch": 0.14, + "grad_norm": 54.78862237470746, + "learning_rate": 4.995922759815338e-07, + "logits/chosen": -2.5866482257843018, + "logits/rejected": -2.566714286804199, + "logps/chosen": -161.02191162109375, + "logps/rejected": -427.61688232421875, + "loss": 0.0341, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2347876876592636, + "rewards/margins": 23.002973556518555, + "rewards/rejected": -23.237762451171875, + "step": 20 + }, + { + "epoch": 0.15, + "grad_norm": 0.9426373656743386, + "learning_rate": 4.995066821070679e-07, + "logits/chosen": -2.632108688354492, + "logits/rejected": -2.5998525619506836, + "logps/chosen": -215.82980346679688, + "logps/rejected": -514.6175537109375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.015092372894287, + "rewards/margins": 23.756736755371094, + "rewards/rejected": -24.77182960510254, + "step": 21 + }, + { + "epoch": 0.16, + "grad_norm": 0.11567822367764634, + "learning_rate": 4.994129476522631e-07, + "logits/chosen": -2.704789638519287, + "logits/rejected": -2.6267013549804688, + "logps/chosen": -214.1823272705078, + "logps/rejected": -486.7920837402344, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.042057812213897705, + "rewards/margins": 26.20745849609375, + "rewards/rejected": -26.249515533447266, + "step": 22 + }, + { + "epoch": 0.16, + "grad_norm": 34.11356148725027, + "learning_rate": 4.993110756753659e-07, + "logits/chosen": -2.753283977508545, + "logits/rejected": -2.619767427444458, + "logps/chosen": -261.2287902832031, + "logps/rejected": -535.1788330078125, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.227592945098877, + "rewards/margins": 25.549057006835938, + "rewards/rejected": -26.776649475097656, + "step": 23 + }, + { + "epoch": 0.17, + "grad_norm": 0.09155594869068727, + "learning_rate": 4.992010695001229e-07, + "logits/chosen": -2.6895101070404053, + "logits/rejected": -2.644169330596924, + "logps/chosen": -230.61830139160156, + "logps/rejected": -535.4132080078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15794718265533447, + "rewards/margins": 26.002853393554688, + "rewards/rejected": -25.844905853271484, + "step": 24 + }, + { + "epoch": 0.18, + "grad_norm": 0.4818665205812188, + "learning_rate": 4.990829327156728e-07, + "logits/chosen": -2.618314027786255, + "logits/rejected": -2.602321147918701, + "logps/chosen": -232.47010803222656, + "logps/rejected": -528.847412109375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40067100524902344, + "rewards/margins": 21.66196060180664, + "rewards/rejected": -22.062631607055664, + "step": 25 + }, + { + "epoch": 0.18, + "grad_norm": 3.327311328513334, + "learning_rate": 4.989566691764295e-07, + "logits/chosen": -2.6846771240234375, + "logits/rejected": -2.5660243034362793, + "logps/chosen": -226.3593292236328, + "logps/rejected": -526.5007934570312, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38518890738487244, + "rewards/margins": 26.244380950927734, + "rewards/rejected": -25.859193801879883, + "step": 26 + }, + { + "epoch": 0.19, + "grad_norm": 1.314446392440851, + "learning_rate": 4.988222830019558e-07, + "logits/chosen": -2.669529676437378, + "logits/rejected": -2.6071646213531494, + "logps/chosen": -177.98822021484375, + "logps/rejected": -392.99609375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02924838662147522, + "rewards/margins": 19.898130416870117, + "rewards/rejected": -19.868881225585938, + "step": 27 + }, + { + "epoch": 0.2, + "grad_norm": 0.2381953252008019, + "learning_rate": 4.986797785768295e-07, + "logits/chosen": -2.6734800338745117, + "logits/rejected": -2.6391868591308594, + "logps/chosen": -158.04425048828125, + "logps/rejected": -459.2864685058594, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7002713680267334, + "rewards/margins": 23.20564842224121, + "rewards/rejected": -23.905920028686523, + "step": 28 + }, + { + "epoch": 0.21, + "grad_norm": 0.10043170165244379, + "learning_rate": 4.985291605505003e-07, + "logits/chosen": -2.6339831352233887, + "logits/rejected": -2.5245838165283203, + "logps/chosen": -209.26747131347656, + "logps/rejected": -439.3158264160156, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23426340520381927, + "rewards/margins": 23.022098541259766, + "rewards/rejected": -22.78783416748047, + "step": 29 + }, + { + "epoch": 0.21, + "grad_norm": 0.1418548745490386, + "learning_rate": 4.983704338371376e-07, + "logits/chosen": -2.629924774169922, + "logits/rejected": -2.5544097423553467, + "logps/chosen": -170.83941650390625, + "logps/rejected": -415.26068115234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8738999366760254, + "rewards/margins": 19.754131317138672, + "rewards/rejected": -20.62803077697754, + "step": 30 + }, + { + "epoch": 0.22, + "grad_norm": 2.17076092159296, + "learning_rate": 4.982036036154705e-07, + "logits/chosen": -2.741410970687866, + "logits/rejected": -2.6220104694366455, + "logps/chosen": -292.33453369140625, + "logps/rejected": -577.2335205078125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7067211866378784, + "rewards/margins": 27.69849395751953, + "rewards/rejected": -28.40521240234375, + "step": 31 + }, + { + "epoch": 0.23, + "grad_norm": 0.10228944961726326, + "learning_rate": 4.980286753286194e-07, + "logits/chosen": -2.6666581630706787, + "logits/rejected": -2.5948023796081543, + "logps/chosen": -187.42221069335938, + "logps/rejected": -464.3155822753906, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4131588637828827, + "rewards/margins": 24.860668182373047, + "rewards/rejected": -25.273828506469727, + "step": 32 + }, + { + "epoch": 0.23, + "grad_norm": 1.6392740975414573, + "learning_rate": 4.978456546839174e-07, + "logits/chosen": -2.6635849475860596, + "logits/rejected": -2.65657901763916, + "logps/chosen": -203.83349609375, + "logps/rejected": -525.3914794921875, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6813369393348694, + "rewards/margins": 25.756736755371094, + "rewards/rejected": -26.438074111938477, + "step": 33 + }, + { + "epoch": 0.24, + "grad_norm": 0.003035026474907154, + "learning_rate": 4.976545476527245e-07, + "logits/chosen": -2.543832302093506, + "logits/rejected": -2.5536553859710693, + "logps/chosen": -164.94317626953125, + "logps/rejected": -469.80242919921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2642908096313477, + "rewards/margins": 24.309701919555664, + "rewards/rejected": -26.573991775512695, + "step": 34 + }, + { + "epoch": 0.25, + "grad_norm": 0.037062331048007276, + "learning_rate": 4.974553604702332e-07, + "logits/chosen": -2.714449405670166, + "logits/rejected": -2.6020994186401367, + "logps/chosen": -227.64306640625, + "logps/rejected": -547.5789184570312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5523563623428345, + "rewards/margins": 29.138961791992188, + "rewards/rejected": -29.69131851196289, + "step": 35 + }, + { + "epoch": 0.26, + "grad_norm": 0.014866592518054757, + "learning_rate": 4.972480996352643e-07, + "logits/chosen": -2.669438600540161, + "logits/rejected": -2.611865520477295, + "logps/chosen": -249.58926391601562, + "logps/rejected": -503.8349609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9827100038528442, + "rewards/margins": 22.718931198120117, + "rewards/rejected": -24.701641082763672, + "step": 36 + }, + { + "epoch": 0.26, + "grad_norm": 11.116295076166882, + "learning_rate": 4.970327719100555e-07, + "logits/chosen": -2.6107139587402344, + "logits/rejected": -2.614323139190674, + "logps/chosen": -255.21981811523438, + "logps/rejected": -576.1066284179688, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4675748348236084, + "rewards/margins": 29.554473876953125, + "rewards/rejected": -32.02204895019531, + "step": 37 + }, + { + "epoch": 0.27, + "grad_norm": 0.01145820312538276, + "learning_rate": 4.968093843200407e-07, + "logits/chosen": -2.6186201572418213, + "logits/rejected": -2.6459224224090576, + "logps/chosen": -188.0570068359375, + "logps/rejected": -499.5050964355469, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5796831846237183, + "rewards/margins": 26.261127471923828, + "rewards/rejected": -27.840808868408203, + "step": 38 + }, + { + "epoch": 0.28, + "grad_norm": 1.1930790483998293, + "learning_rate": 4.965779441536201e-07, + "logits/chosen": -2.7035415172576904, + "logits/rejected": -2.6863503456115723, + "logps/chosen": -217.47967529296875, + "logps/rejected": -517.9664916992188, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0054142475128174, + "rewards/margins": 24.799697875976562, + "rewards/rejected": -27.805110931396484, + "step": 39 + }, + { + "epoch": 0.28, + "grad_norm": 4.13984571001177, + "learning_rate": 4.963384589619232e-07, + "logits/chosen": -2.6965861320495605, + "logits/rejected": -2.6189560890197754, + "logps/chosen": -256.41497802734375, + "logps/rejected": -601.5928955078125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7531956434249878, + "rewards/margins": 31.01791000366211, + "rewards/rejected": -32.77110290527344, + "step": 40 + }, + { + "epoch": 0.29, + "grad_norm": 0.006769514002002826, + "learning_rate": 4.960909365585624e-07, + "logits/chosen": -2.6950056552886963, + "logits/rejected": -2.675766706466675, + "logps/chosen": -220.96633911132812, + "logps/rejected": -574.105712890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0254430770874023, + "rewards/margins": 30.308761596679688, + "rewards/rejected": -32.334205627441406, + "step": 41 + }, + { + "epoch": 0.3, + "grad_norm": 4.785442238519398, + "learning_rate": 4.958353850193773e-07, + "logits/chosen": -2.7881109714508057, + "logits/rejected": -2.633843421936035, + "logps/chosen": -306.7889099121094, + "logps/rejected": -600.2064208984375, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7407360076904297, + "rewards/margins": 30.786224365234375, + "rewards/rejected": -33.52696228027344, + "step": 42 + }, + { + "epoch": 0.31, + "grad_norm": 0.710936296869806, + "learning_rate": 4.955718126821722e-07, + "logits/chosen": -2.6794488430023193, + "logits/rejected": -2.589600086212158, + "logps/chosen": -249.20724487304688, + "logps/rejected": -485.8760070800781, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4175808429718018, + "rewards/margins": 25.45222282409668, + "rewards/rejected": -27.869800567626953, + "step": 43 + }, + { + "epoch": 0.31, + "grad_norm": 0.14331868857928534, + "learning_rate": 4.953002281464431e-07, + "logits/chosen": -2.693408489227295, + "logits/rejected": -2.667663335800171, + "logps/chosen": -249.45562744140625, + "logps/rejected": -653.1517333984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8556472063064575, + "rewards/margins": 34.33069610595703, + "rewards/rejected": -36.186344146728516, + "step": 44 + }, + { + "epoch": 0.32, + "grad_norm": 0.9948880729086409, + "learning_rate": 4.950206402730983e-07, + "logits/chosen": -2.71539568901062, + "logits/rejected": -2.6522209644317627, + "logps/chosen": -273.3675231933594, + "logps/rejected": -602.6591796875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3636507987976074, + "rewards/margins": 28.48233413696289, + "rewards/rejected": -31.845983505249023, + "step": 45 + }, + { + "epoch": 0.33, + "grad_norm": 0.009408795199841412, + "learning_rate": 4.94733058184168e-07, + "logits/chosen": -2.7508957386016846, + "logits/rejected": -2.6326212882995605, + "logps/chosen": -262.9375915527344, + "logps/rejected": -620.339111328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23247653245925903, + "rewards/margins": 35.838478088378906, + "rewards/rejected": -36.070953369140625, + "step": 46 + }, + { + "epoch": 0.33, + "grad_norm": 0.37450245846347946, + "learning_rate": 4.944374912625075e-07, + "logits/chosen": -2.717337131500244, + "logits/rejected": -2.607605457305908, + "logps/chosen": -248.4813232421875, + "logps/rejected": -533.3270263671875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5064290761947632, + "rewards/margins": 30.289798736572266, + "rewards/rejected": -31.796226501464844, + "step": 47 + }, + { + "epoch": 0.34, + "grad_norm": 0.1835344974224305, + "learning_rate": 4.941339491514909e-07, + "logits/chosen": -2.6347856521606445, + "logits/rejected": -2.5980584621429443, + "logps/chosen": -234.634765625, + "logps/rejected": -520.9091796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.370199680328369, + "rewards/margins": 24.74384307861328, + "rewards/rejected": -27.114044189453125, + "step": 48 + }, + { + "epoch": 0.35, + "grad_norm": 0.19644898924785964, + "learning_rate": 4.938224417546964e-07, + "logits/chosen": -2.6752350330352783, + "logits/rejected": -2.612856388092041, + "logps/chosen": -255.13584899902344, + "logps/rejected": -544.5784912109375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.900641918182373, + "rewards/margins": 28.50861167907715, + "rewards/rejected": -31.40925407409668, + "step": 49 + }, + { + "epoch": 0.36, + "grad_norm": 12.667309452461893, + "learning_rate": 4.935029792355834e-07, + "logits/chosen": -2.6576719284057617, + "logits/rejected": -2.6111912727355957, + "logps/chosen": -226.2794952392578, + "logps/rejected": -544.1780395507812, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.08176589012146, + "rewards/margins": 26.937583923339844, + "rewards/rejected": -30.01934814453125, + "step": 50 + }, + { + "epoch": 0.36, + "grad_norm": 0.33442295848196923, + "learning_rate": 4.931755720171603e-07, + "logits/chosen": -2.711118221282959, + "logits/rejected": -2.621406078338623, + "logps/chosen": -247.03359985351562, + "logps/rejected": -543.7137451171875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4761788845062256, + "rewards/margins": 28.42976951599121, + "rewards/rejected": -30.90595054626465, + "step": 51 + }, + { + "epoch": 0.37, + "grad_norm": 0.002978605963135415, + "learning_rate": 4.928402307816451e-07, + "logits/chosen": -2.641918659210205, + "logits/rejected": -2.6261231899261475, + "logps/chosen": -282.81689453125, + "logps/rejected": -646.1288452148438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9793777465820312, + "rewards/margins": 35.47356033325195, + "rewards/rejected": -38.452938079833984, + "step": 52 + }, + { + "epoch": 0.38, + "grad_norm": 0.007293530906843193, + "learning_rate": 4.924969664701168e-07, + "logits/chosen": -2.68820858001709, + "logits/rejected": -2.657637357711792, + "logps/chosen": -229.20904541015625, + "logps/rejected": -615.5895385742188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1064367294311523, + "rewards/margins": 34.568870544433594, + "rewards/rejected": -36.67530822753906, + "step": 53 + }, + { + "epoch": 0.38, + "grad_norm": 0.07313564002307854, + "learning_rate": 4.921457902821578e-07, + "logits/chosen": -2.7021608352661133, + "logits/rejected": -2.704730272293091, + "logps/chosen": -204.37091064453125, + "logps/rejected": -673.6033935546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7201132774353027, + "rewards/margins": 38.21685028076172, + "rewards/rejected": -40.93695831298828, + "step": 54 + }, + { + "epoch": 0.39, + "grad_norm": 0.9236766720206969, + "learning_rate": 4.917867136754893e-07, + "logits/chosen": -2.7190704345703125, + "logits/rejected": -2.5890862941741943, + "logps/chosen": -226.73704528808594, + "logps/rejected": -545.2869262695312, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.233630895614624, + "rewards/margins": 29.354625701904297, + "rewards/rejected": -31.5882568359375, + "step": 55 + }, + { + "epoch": 0.4, + "grad_norm": 4.936663709813187, + "learning_rate": 4.914197483655969e-07, + "logits/chosen": -2.6672158241271973, + "logits/rejected": -2.623007297515869, + "logps/chosen": -231.91249084472656, + "logps/rejected": -575.4192504882812, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2700870633125305, + "rewards/margins": 29.446908950805664, + "rewards/rejected": -29.716995239257812, + "step": 56 + }, + { + "epoch": 0.4, + "grad_norm": 1.1440545165867206, + "learning_rate": 4.910449063253489e-07, + "logits/chosen": -2.6058759689331055, + "logits/rejected": -2.5625879764556885, + "logps/chosen": -198.53195190429688, + "logps/rejected": -516.7724609375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.585146903991699, + "rewards/margins": 26.276334762573242, + "rewards/rejected": -28.861480712890625, + "step": 57 + }, + { + "epoch": 0.41, + "grad_norm": 0.35055442807099685, + "learning_rate": 4.906621997846048e-07, + "logits/chosen": -2.560368537902832, + "logits/rejected": -2.552420139312744, + "logps/chosen": -216.03591918945312, + "logps/rejected": -537.11767578125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9979751110076904, + "rewards/margins": 29.738142013549805, + "rewards/rejected": -32.736114501953125, + "step": 58 + }, + { + "epoch": 0.42, + "grad_norm": 13.981366742421361, + "learning_rate": 4.902716412298173e-07, + "logits/chosen": -2.636011838912964, + "logits/rejected": -2.6296021938323975, + "logps/chosen": -254.3116455078125, + "logps/rejected": -580.985107421875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.454509735107422, + "rewards/margins": 30.887685775756836, + "rewards/rejected": -33.342193603515625, + "step": 59 + }, + { + "epoch": 0.43, + "grad_norm": 0.05969154440241812, + "learning_rate": 4.898732434036243e-07, + "logits/chosen": -2.653785228729248, + "logits/rejected": -2.5550966262817383, + "logps/chosen": -249.27053833007812, + "logps/rejected": -623.03271484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6164228916168213, + "rewards/margins": 37.17173385620117, + "rewards/rejected": -38.78815841674805, + "step": 60 + }, + { + "epoch": 0.43, + "grad_norm": 44.85756557932988, + "learning_rate": 4.894670193044331e-07, + "logits/chosen": -2.6769516468048096, + "logits/rejected": -2.5617403984069824, + "logps/chosen": -275.76226806640625, + "logps/rejected": -588.6322021484375, + "loss": 0.0262, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7687606811523438, + "rewards/margins": 30.032920837402344, + "rewards/rejected": -33.80168151855469, + "step": 61 + }, + { + "epoch": 0.44, + "grad_norm": 25.05609929319295, + "learning_rate": 4.890529821859968e-07, + "logits/chosen": -2.698042869567871, + "logits/rejected": -2.6007933616638184, + "logps/chosen": -319.7588806152344, + "logps/rejected": -666.7498779296875, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4817442893981934, + "rewards/margins": 36.213741302490234, + "rewards/rejected": -39.69548416137695, + "step": 62 + }, + { + "epoch": 0.45, + "grad_norm": 0.00886287701258941, + "learning_rate": 4.88631145556981e-07, + "logits/chosen": -2.677527904510498, + "logits/rejected": -2.5850651264190674, + "logps/chosen": -288.19940185546875, + "logps/rejected": -634.798828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.209935665130615, + "rewards/margins": 36.36002731323242, + "rewards/rejected": -40.56996154785156, + "step": 63 + }, + { + "epoch": 0.45, + "grad_norm": 0.000938886598454837, + "learning_rate": 4.882015231805244e-07, + "logits/chosen": -2.6223208904266357, + "logits/rejected": -2.6430504322052, + "logps/chosen": -231.6414794921875, + "logps/rejected": -655.280517578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.702812433242798, + "rewards/margins": 33.715309143066406, + "rewards/rejected": -37.418121337890625, + "step": 64 + }, + { + "epoch": 0.46, + "grad_norm": 0.08822060400580437, + "learning_rate": 4.877641290737883e-07, + "logits/chosen": -2.6343486309051514, + "logits/rejected": -2.5857908725738525, + "logps/chosen": -267.01556396484375, + "logps/rejected": -658.9562377929688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9423695802688599, + "rewards/margins": 36.631046295166016, + "rewards/rejected": -38.57341384887695, + "step": 65 + }, + { + "epoch": 0.47, + "grad_norm": 18.62758683373039, + "learning_rate": 4.873189775075004e-07, + "logits/chosen": -2.6101677417755127, + "logits/rejected": -2.5529539585113525, + "logps/chosen": -260.031494140625, + "logps/rejected": -535.39453125, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.324441909790039, + "rewards/margins": 28.348983764648438, + "rewards/rejected": -32.673423767089844, + "step": 66 + }, + { + "epoch": 0.48, + "grad_norm": 0.10386658843461564, + "learning_rate": 4.868660830054883e-07, + "logits/chosen": -2.63973069190979, + "logits/rejected": -2.585453987121582, + "logps/chosen": -254.2733154296875, + "logps/rejected": -632.6726684570312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6447956562042236, + "rewards/margins": 33.55132293701172, + "rewards/rejected": -35.19612121582031, + "step": 67 + }, + { + "epoch": 0.48, + "grad_norm": 0.0088867124233782, + "learning_rate": 4.864054603442063e-07, + "logits/chosen": -2.6842851638793945, + "logits/rejected": -2.566390037536621, + "logps/chosen": -267.3186950683594, + "logps/rejected": -598.654296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.59023118019104, + "rewards/margins": 30.732364654541016, + "rewards/rejected": -33.322593688964844, + "step": 68 + }, + { + "epoch": 0.49, + "grad_norm": 6.3031421726605625, + "learning_rate": 4.85937124552253e-07, + "logits/chosen": -2.602694511413574, + "logits/rejected": -2.6041197776794434, + "logps/chosen": -223.94290161132812, + "logps/rejected": -628.2261352539062, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.583258628845215, + "rewards/margins": 35.93033218383789, + "rewards/rejected": -38.513587951660156, + "step": 69 + }, + { + "epoch": 0.5, + "grad_norm": 0.05652992059387897, + "learning_rate": 4.854610909098811e-07, + "logits/chosen": -2.619680643081665, + "logits/rejected": -2.5189099311828613, + "logps/chosen": -243.2657470703125, + "logps/rejected": -609.9871826171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.458897113800049, + "rewards/margins": 34.001102447509766, + "rewards/rejected": -36.459999084472656, + "step": 70 + }, + { + "epoch": 0.5, + "grad_norm": 0.09491048856881475, + "learning_rate": 4.849773749484989e-07, + "logits/chosen": -2.6266448497772217, + "logits/rejected": -2.646352767944336, + "logps/chosen": -216.81683349609375, + "logps/rejected": -673.9139404296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.803138494491577, + "rewards/margins": 42.61211395263672, + "rewards/rejected": -45.415252685546875, + "step": 71 + }, + { + "epoch": 0.51, + "grad_norm": 0.0008959512432802318, + "learning_rate": 4.84485992450163e-07, + "logits/chosen": -2.6601815223693848, + "logits/rejected": -2.579123020172119, + "logps/chosen": -261.23162841796875, + "logps/rejected": -696.0081787109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5922129154205322, + "rewards/margins": 39.79978942871094, + "rewards/rejected": -42.392005920410156, + "step": 72 + }, + { + "epoch": 0.52, + "grad_norm": 0.0010855851275110065, + "learning_rate": 4.839869594470642e-07, + "logits/chosen": -2.6099042892456055, + "logits/rejected": -2.545607089996338, + "logps/chosen": -256.48431396484375, + "logps/rejected": -641.6387939453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8345205783843994, + "rewards/margins": 34.13178253173828, + "rewards/rejected": -37.96630096435547, + "step": 73 + }, + { + "epoch": 0.53, + "grad_norm": 0.7536169081768569, + "learning_rate": 4.834802922210039e-07, + "logits/chosen": -2.620169162750244, + "logits/rejected": -2.5841073989868164, + "logps/chosen": -279.88775634765625, + "logps/rejected": -678.869873046875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.943654537200928, + "rewards/margins": 35.603397369384766, + "rewards/rejected": -40.547054290771484, + "step": 74 + }, + { + "epoch": 0.53, + "grad_norm": 0.7997234243313621, + "learning_rate": 4.829660073028631e-07, + "logits/chosen": -2.5955264568328857, + "logits/rejected": -2.5334062576293945, + "logps/chosen": -253.80538940429688, + "logps/rejected": -673.9427490234375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.711472749710083, + "rewards/margins": 36.90414810180664, + "rewards/rejected": -39.615623474121094, + "step": 75 + }, + { + "epoch": 0.54, + "grad_norm": 0.006658224494615943, + "learning_rate": 4.824441214720628e-07, + "logits/chosen": -2.6248621940612793, + "logits/rejected": -2.558454990386963, + "logps/chosen": -251.99954223632812, + "logps/rejected": -583.3997192382812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.744347095489502, + "rewards/margins": 31.197547912597656, + "rewards/rejected": -34.94189453125, + "step": 76 + }, + { + "epoch": 0.55, + "grad_norm": 0.00035202099957007887, + "learning_rate": 4.81914651756017e-07, + "logits/chosen": -2.5931782722473145, + "logits/rejected": -2.5209240913391113, + "logps/chosen": -220.61691284179688, + "logps/rejected": -619.3521728515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.440421462059021, + "rewards/margins": 36.423240661621094, + "rewards/rejected": -37.86366271972656, + "step": 77 + }, + { + "epoch": 0.55, + "grad_norm": 0.020705129779926876, + "learning_rate": 4.813776154295766e-07, + "logits/chosen": -2.566633701324463, + "logits/rejected": -2.510991096496582, + "logps/chosen": -220.09332275390625, + "logps/rejected": -604.5810546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2026134729385376, + "rewards/margins": 35.91285705566406, + "rewards/rejected": -37.11547088623047, + "step": 78 + }, + { + "epoch": 0.56, + "grad_norm": 0.0010525837529717053, + "learning_rate": 4.808330300144663e-07, + "logits/chosen": -2.5725741386413574, + "logits/rejected": -2.5592739582061768, + "logps/chosen": -240.62181091308594, + "logps/rejected": -686.8818969726562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.369328022003174, + "rewards/margins": 40.23169708251953, + "rewards/rejected": -44.60102844238281, + "step": 79 + }, + { + "epoch": 0.57, + "grad_norm": 0.002626900634782272, + "learning_rate": 4.802809132787125e-07, + "logits/chosen": -2.5818638801574707, + "logits/rejected": -2.5948946475982666, + "logps/chosen": -207.4244384765625, + "logps/rejected": -694.197021484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1585419178009033, + "rewards/margins": 43.700706481933594, + "rewards/rejected": -44.859249114990234, + "step": 80 + }, + { + "epoch": 0.58, + "grad_norm": 0.00015381022664025858, + "learning_rate": 4.797212832360637e-07, + "logits/chosen": -2.6559855937957764, + "logits/rejected": -2.605440616607666, + "logps/chosen": -258.9279479980469, + "logps/rejected": -669.9668579101562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9578224420547485, + "rewards/margins": 38.35432434082031, + "rewards/rejected": -39.31214141845703, + "step": 81 + }, + { + "epoch": 0.58, + "grad_norm": 8.14746214616701, + "learning_rate": 4.79154158145403e-07, + "logits/chosen": -2.5828893184661865, + "logits/rejected": -2.5167794227600098, + "logps/chosen": -245.53335571289062, + "logps/rejected": -567.3512573242188, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.574049472808838, + "rewards/margins": 30.266849517822266, + "rewards/rejected": -33.84090042114258, + "step": 82 + }, + { + "epoch": 0.59, + "grad_norm": 44.627080331193405, + "learning_rate": 4.785795565101519e-07, + "logits/chosen": -2.619720458984375, + "logits/rejected": -2.565762758255005, + "logps/chosen": -231.5816192626953, + "logps/rejected": -630.4521484375, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.968224287033081, + "rewards/margins": 34.13584899902344, + "rewards/rejected": -37.10407638549805, + "step": 83 + }, + { + "epoch": 0.6, + "grad_norm": 0.000374858838067523, + "learning_rate": 4.779974970776675e-07, + "logits/chosen": -2.516183614730835, + "logits/rejected": -2.486583709716797, + "logps/chosen": -213.62777709960938, + "logps/rejected": -603.7626953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4763965606689453, + "rewards/margins": 36.45835494995117, + "rewards/rejected": -38.934749603271484, + "step": 84 + }, + { + "epoch": 0.6, + "grad_norm": 0.005376780888027256, + "learning_rate": 4.774079988386296e-07, + "logits/chosen": -2.604816436767578, + "logits/rejected": -2.582223892211914, + "logps/chosen": -231.11090087890625, + "logps/rejected": -647.1555786132812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.299638748168945, + "rewards/margins": 36.099666595458984, + "rewards/rejected": -41.39930725097656, + "step": 85 + }, + { + "epoch": 0.61, + "grad_norm": 0.0066018632586047924, + "learning_rate": 4.76811081026422e-07, + "logits/chosen": -2.6014785766601562, + "logits/rejected": -2.575904369354248, + "logps/chosen": -213.63560485839844, + "logps/rejected": -630.609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.196646690368652, + "rewards/margins": 34.880653381347656, + "rewards/rejected": -39.077301025390625, + "step": 86 + }, + { + "epoch": 0.62, + "grad_norm": 0.027784301225530984, + "learning_rate": 4.762067631165049e-07, + "logits/chosen": -2.6610922813415527, + "logits/rejected": -2.580592632293701, + "logps/chosen": -250.07308959960938, + "logps/rejected": -630.8294067382812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9165632724761963, + "rewards/margins": 33.92185974121094, + "rewards/rejected": -36.83842468261719, + "step": 87 + }, + { + "epoch": 0.63, + "grad_norm": 12.57471505813362, + "learning_rate": 4.755950648257788e-07, + "logits/chosen": -2.643458366394043, + "logits/rejected": -2.6349523067474365, + "logps/chosen": -190.123779296875, + "logps/rejected": -601.351318359375, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7773460149765015, + "rewards/margins": 39.19086837768555, + "rewards/rejected": -40.96821212768555, + "step": 88 + }, + { + "epoch": 0.63, + "grad_norm": 0.0048020455296621046, + "learning_rate": 4.7497600611194223e-07, + "logits/chosen": -2.5902414321899414, + "logits/rejected": -2.5505082607269287, + "logps/chosen": -252.64024353027344, + "logps/rejected": -637.9603881835938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.465909481048584, + "rewards/margins": 34.50959396362305, + "rewards/rejected": -37.975502014160156, + "step": 89 + }, + { + "epoch": 0.64, + "grad_norm": 0.0009230595218726257, + "learning_rate": 4.743496071728396e-07, + "logits/chosen": -2.585861921310425, + "logits/rejected": -2.558704137802124, + "logps/chosen": -229.53146362304688, + "logps/rejected": -641.9647827148438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3721907138824463, + "rewards/margins": 37.53388595581055, + "rewards/rejected": -38.90607452392578, + "step": 90 + }, + { + "epoch": 0.65, + "grad_norm": 0.05271175571455843, + "learning_rate": 4.7371588844580296e-07, + "logits/chosen": -2.645639181137085, + "logits/rejected": -2.5097503662109375, + "logps/chosen": -235.87155151367188, + "logps/rejected": -596.9464111328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9649837017059326, + "rewards/margins": 33.518882751464844, + "rewards/rejected": -36.48386764526367, + "step": 91 + }, + { + "epoch": 0.65, + "grad_norm": 0.013229322400623182, + "learning_rate": 4.730748706069848e-07, + "logits/chosen": -2.62890625, + "logits/rejected": -2.57592511177063, + "logps/chosen": -213.96942138671875, + "logps/rejected": -676.9241943359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.331294059753418, + "rewards/margins": 42.68187713623047, + "rewards/rejected": -44.01317596435547, + "step": 92 + }, + { + "epoch": 0.66, + "grad_norm": 0.011218291590593801, + "learning_rate": 4.724265745706836e-07, + "logits/chosen": -2.4631335735321045, + "logits/rejected": -2.497346878051758, + "logps/chosen": -244.53843688964844, + "logps/rejected": -643.1104736328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.857229709625244, + "rewards/margins": 32.60251998901367, + "rewards/rejected": -37.459754943847656, + "step": 93 + }, + { + "epoch": 0.67, + "grad_norm": 1.6120150642863533, + "learning_rate": 4.7177102148866135e-07, + "logits/chosen": -2.6028716564178467, + "logits/rejected": -2.5135581493377686, + "logps/chosen": -226.32772827148438, + "logps/rejected": -572.7806396484375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5566359758377075, + "rewards/margins": 33.74810028076172, + "rewards/rejected": -35.30474090576172, + "step": 94 + }, + { + "epoch": 0.67, + "grad_norm": 18.875488988700052, + "learning_rate": 4.7110823274945357e-07, + "logits/chosen": -2.5668301582336426, + "logits/rejected": -2.554222583770752, + "logps/chosen": -234.818359375, + "logps/rejected": -609.0420532226562, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9635541439056396, + "rewards/margins": 33.658660888671875, + "rewards/rejected": -35.622215270996094, + "step": 95 + }, + { + "epoch": 0.68, + "grad_norm": 15.156790979675021, + "learning_rate": 4.704382299776714e-07, + "logits/chosen": -2.535076856613159, + "logits/rejected": -2.505333423614502, + "logps/chosen": -196.57296752929688, + "logps/rejected": -575.2232666015625, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0395612716674805, + "rewards/margins": 35.05572509765625, + "rewards/rejected": -36.09528732299805, + "step": 96 + }, + { + "epoch": 0.69, + "grad_norm": 0.004157306395281978, + "learning_rate": 4.697610350332961e-07, + "logits/chosen": -2.6655004024505615, + "logits/rejected": -2.5744552612304688, + "logps/chosen": -255.20083618164062, + "logps/rejected": -598.7930908203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4714818000793457, + "rewards/margins": 33.17194366455078, + "rewards/rejected": -33.6434326171875, + "step": 97 + }, + { + "epoch": 0.7, + "grad_norm": 0.0028904761238828475, + "learning_rate": 4.6907667001096585e-07, + "logits/chosen": -2.596937656402588, + "logits/rejected": -2.476288318634033, + "logps/chosen": -215.81704711914062, + "logps/rejected": -578.166259765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9031833410263062, + "rewards/margins": 32.43394470214844, + "rewards/rejected": -33.33712387084961, + "step": 98 + }, + { + "epoch": 0.7, + "grad_norm": 2.234337837593256, + "learning_rate": 4.6838515723925476e-07, + "logits/chosen": -2.5667247772216797, + "logits/rejected": -2.561495780944824, + "logps/chosen": -214.50173950195312, + "logps/rejected": -663.0035400390625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7073841094970703, + "rewards/margins": 36.75779724121094, + "rewards/rejected": -38.46518325805664, + "step": 99 + }, + { + "epoch": 0.71, + "grad_norm": 0.0034950719287433346, + "learning_rate": 4.676865192799443e-07, + "logits/chosen": -2.587968349456787, + "logits/rejected": -2.545469284057617, + "logps/chosen": -251.37530517578125, + "logps/rejected": -662.8966064453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5641937255859375, + "rewards/margins": 36.153560638427734, + "rewards/rejected": -36.71775436401367, + "step": 100 + }, + { + "epoch": 0.72, + "grad_norm": 0.02881877600023013, + "learning_rate": 4.669807789272876e-07, + "logits/chosen": -2.633981943130493, + "logits/rejected": -2.5468287467956543, + "logps/chosen": -243.59738159179688, + "logps/rejected": -646.7576904296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08092263340950012, + "rewards/margins": 36.83740234375, + "rewards/rejected": -36.91832733154297, + "step": 101 + }, + { + "epoch": 0.72, + "grad_norm": 0.10074327685985839, + "learning_rate": 4.6626795920726527e-07, + "logits/chosen": -2.583199977874756, + "logits/rejected": -2.5085253715515137, + "logps/chosen": -221.7645263671875, + "logps/rejected": -634.94970703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6072038412094116, + "rewards/margins": 41.36707305908203, + "rewards/rejected": -41.974281311035156, + "step": 102 + }, + { + "epoch": 0.73, + "grad_norm": 0.00026595454746943605, + "learning_rate": 4.655480833768344e-07, + "logits/chosen": -2.5912704467773438, + "logits/rejected": -2.492095708847046, + "logps/chosen": -239.21820068359375, + "logps/rejected": -556.9544677734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1503007411956787, + "rewards/margins": 29.770492553710938, + "rewards/rejected": -30.920791625976562, + "step": 103 + }, + { + "epoch": 0.74, + "grad_norm": 0.06312750808755165, + "learning_rate": 4.6482117492316975e-07, + "logits/chosen": -2.6155478954315186, + "logits/rejected": -2.567389488220215, + "logps/chosen": -249.32171630859375, + "logps/rejected": -611.0191040039062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.161124587059021, + "rewards/margins": 35.573848724365234, + "rewards/rejected": -36.7349739074707, + "step": 104 + }, + { + "epoch": 0.75, + "grad_norm": 0.02637237918658033, + "learning_rate": 4.6408725756289725e-07, + "logits/chosen": -2.545167922973633, + "logits/rejected": -2.5693089962005615, + "logps/chosen": -207.8555145263672, + "logps/rejected": -607.4221801757812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2742688655853271, + "rewards/margins": 32.774723052978516, + "rewards/rejected": -34.04899215698242, + "step": 105 + }, + { + "epoch": 0.75, + "grad_norm": 0.6488246897750426, + "learning_rate": 4.633463552413204e-07, + "logits/chosen": -2.58434796333313, + "logits/rejected": -2.552529811859131, + "logps/chosen": -210.70130920410156, + "logps/rejected": -570.949951171875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4252532720565796, + "rewards/margins": 30.310836791992188, + "rewards/rejected": -31.73609161376953, + "step": 106 + }, + { + "epoch": 0.76, + "grad_norm": 0.02624969120554211, + "learning_rate": 4.6259849213163915e-07, + "logits/chosen": -2.56691837310791, + "logits/rejected": -2.522692918777466, + "logps/chosen": -210.0999755859375, + "logps/rejected": -580.938720703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.854114294052124, + "rewards/margins": 34.5439453125, + "rewards/rejected": -37.39805603027344, + "step": 107 + }, + { + "epoch": 0.77, + "grad_norm": 0.004673035992259946, + "learning_rate": 4.618436926341606e-07, + "logits/chosen": -2.5112953186035156, + "logits/rejected": -2.565476894378662, + "logps/chosen": -213.46022033691406, + "logps/rejected": -608.3571166992188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9263553619384766, + "rewards/margins": 32.940757751464844, + "rewards/rejected": -35.86711120605469, + "step": 108 + }, + { + "epoch": 0.77, + "grad_norm": 0.006713666563122458, + "learning_rate": 4.6108198137550377e-07, + "logits/chosen": -2.5288867950439453, + "logits/rejected": -2.578467845916748, + "logps/chosen": -185.68722534179688, + "logps/rejected": -582.3807373046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8703787326812744, + "rewards/margins": 34.04914855957031, + "rewards/rejected": -35.919525146484375, + "step": 109 + }, + { + "epoch": 0.78, + "grad_norm": 0.00568403946429323, + "learning_rate": 4.603133832077953e-07, + "logits/chosen": -2.5672121047973633, + "logits/rejected": -2.571394920349121, + "logps/chosen": -173.45330810546875, + "logps/rejected": -587.3894653320312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5344221591949463, + "rewards/margins": 33.35356521606445, + "rewards/rejected": -35.88798904418945, + "step": 110 + }, + { + "epoch": 0.79, + "grad_norm": 19.206255464738412, + "learning_rate": 4.595379232078591e-07, + "logits/chosen": -2.5190114974975586, + "logits/rejected": -2.5229198932647705, + "logps/chosen": -190.62655639648438, + "logps/rejected": -591.9658203125, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.913595199584961, + "rewards/margins": 32.8871955871582, + "rewards/rejected": -35.80078887939453, + "step": 111 + }, + { + "epoch": 0.8, + "grad_norm": 0.011368564684718476, + "learning_rate": 4.5875562667639814e-07, + "logits/chosen": -2.6099767684936523, + "logits/rejected": -2.51930570602417, + "logps/chosen": -271.2085266113281, + "logps/rejected": -620.4912109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.517024040222168, + "rewards/margins": 34.295997619628906, + "rewards/rejected": -36.813018798828125, + "step": 112 + }, + { + "epoch": 0.8, + "grad_norm": 0.0026468683122158115, + "learning_rate": 4.5796651913716866e-07, + "logits/chosen": -2.6443378925323486, + "logits/rejected": -2.58347749710083, + "logps/chosen": -232.60769653320312, + "logps/rejected": -694.8738403320312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3937445282936096, + "rewards/margins": 40.25984191894531, + "rewards/rejected": -40.653587341308594, + "step": 113 + }, + { + "epoch": 0.81, + "grad_norm": 4.618182187370094, + "learning_rate": 4.571706263361479e-07, + "logits/chosen": -2.6016504764556885, + "logits/rejected": -2.4843671321868896, + "logps/chosen": -245.8649444580078, + "logps/rejected": -575.270263671875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6936731338500977, + "rewards/margins": 33.10950469970703, + "rewards/rejected": -34.80317687988281, + "step": 114 + }, + { + "epoch": 0.82, + "grad_norm": 0.27448241192379685, + "learning_rate": 4.563679742406935e-07, + "logits/chosen": -2.6160361766815186, + "logits/rejected": -2.5851187705993652, + "logps/chosen": -192.36016845703125, + "logps/rejected": -557.3076782226562, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14625012874603271, + "rewards/margins": 31.823680877685547, + "rewards/rejected": -31.969928741455078, + "step": 115 + }, + { + "epoch": 0.82, + "grad_norm": 0.752924121648113, + "learning_rate": 4.555585890386968e-07, + "logits/chosen": -2.566042423248291, + "logits/rejected": -2.5212507247924805, + "logps/chosen": -261.55499267578125, + "logps/rejected": -662.49267578125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5800769329071045, + "rewards/margins": 36.31473159790039, + "rewards/rejected": -38.89480972290039, + "step": 116 + }, + { + "epoch": 0.83, + "grad_norm": 0.005663806343638855, + "learning_rate": 4.5474249713772815e-07, + "logits/chosen": -2.6562113761901855, + "logits/rejected": -2.5219335556030273, + "logps/chosen": -231.86148071289062, + "logps/rejected": -563.2797241210938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.286268711090088, + "rewards/margins": 33.023834228515625, + "rewards/rejected": -35.31010437011719, + "step": 117 + }, + { + "epoch": 0.84, + "grad_norm": 0.7400849203509472, + "learning_rate": 4.539197251641754e-07, + "logits/chosen": -2.523650646209717, + "logits/rejected": -2.495345115661621, + "logps/chosen": -186.66293334960938, + "logps/rejected": -532.6724243164062, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6615891456604004, + "rewards/margins": 32.66539001464844, + "rewards/rejected": -35.32698059082031, + "step": 118 + }, + { + "epoch": 0.85, + "grad_norm": 0.06862223292792247, + "learning_rate": 4.5309029996237513e-07, + "logits/chosen": -2.579279661178589, + "logits/rejected": -2.500725269317627, + "logps/chosen": -211.95346069335938, + "logps/rejected": -592.3385620117188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.046866536140442, + "rewards/margins": 33.913909912109375, + "rewards/rejected": -34.96077346801758, + "step": 119 + }, + { + "epoch": 0.85, + "grad_norm": 0.024561571236493333, + "learning_rate": 4.5225424859373684e-07, + "logits/chosen": -2.622439384460449, + "logits/rejected": -2.5584168434143066, + "logps/chosen": -229.82626342773438, + "logps/rejected": -548.0466918945312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.172595500946045, + "rewards/margins": 30.864500045776367, + "rewards/rejected": -32.03709411621094, + "step": 120 + }, + { + "epoch": 0.86, + "grad_norm": 0.015407667881027563, + "learning_rate": 4.514115983358599e-07, + "logits/chosen": -2.533438205718994, + "logits/rejected": -2.532291889190674, + "logps/chosen": -213.50225830078125, + "logps/rejected": -652.7657470703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5436322689056396, + "rewards/margins": 38.55556869506836, + "rewards/rejected": -39.09920120239258, + "step": 121 + }, + { + "epoch": 0.87, + "grad_norm": 0.20221878430074328, + "learning_rate": 4.5056237668164375e-07, + "logits/chosen": -2.604402780532837, + "logits/rejected": -2.536449909210205, + "logps/chosen": -269.92047119140625, + "logps/rejected": -636.8568725585938, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.217914581298828, + "rewards/margins": 31.1136474609375, + "rewards/rejected": -33.331565856933594, + "step": 122 + }, + { + "epoch": 0.87, + "grad_norm": 0.0024634293098567037, + "learning_rate": 4.4970661133839094e-07, + "logits/chosen": -2.567944049835205, + "logits/rejected": -2.558134078979492, + "logps/chosen": -188.28123474121094, + "logps/rejected": -669.8164672851562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.090487003326416, + "rewards/margins": 43.07219696044922, + "rewards/rejected": -45.162689208984375, + "step": 123 + }, + { + "epoch": 0.88, + "grad_norm": 0.20323566669180854, + "learning_rate": 4.4884433022690273e-07, + "logits/chosen": -2.6232047080993652, + "logits/rejected": -2.607207775115967, + "logps/chosen": -180.43475341796875, + "logps/rejected": -579.5249633789062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3108582496643066, + "rewards/margins": 35.67290115356445, + "rewards/rejected": -36.98375701904297, + "step": 124 + }, + { + "epoch": 0.89, + "grad_norm": 0.24231928367738273, + "learning_rate": 4.4797556148056876e-07, + "logits/chosen": -2.5931971073150635, + "logits/rejected": -2.559455394744873, + "logps/chosen": -242.80609130859375, + "logps/rejected": -695.3267822265625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3251227140426636, + "rewards/margins": 43.132415771484375, + "rewards/rejected": -44.45753860473633, + "step": 125 + }, + { + "epoch": 0.9, + "grad_norm": 0.011024666873025644, + "learning_rate": 4.4710033344444853e-07, + "logits/chosen": -2.5954768657684326, + "logits/rejected": -2.501197576522827, + "logps/chosen": -230.17742919921875, + "logps/rejected": -611.9140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.497666835784912, + "rewards/margins": 35.788063049316406, + "rewards/rejected": -37.285728454589844, + "step": 126 + }, + { + "epoch": 0.9, + "grad_norm": 0.00157172399275407, + "learning_rate": 4.4621867467434706e-07, + "logits/chosen": -2.5101070404052734, + "logits/rejected": -2.569409132003784, + "logps/chosen": -199.55877685546875, + "logps/rejected": -614.8193359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.306656837463379, + "rewards/margins": 34.86344909667969, + "rewards/rejected": -38.17010498046875, + "step": 127 + }, + { + "epoch": 0.91, + "grad_norm": 2.020817965291877, + "learning_rate": 4.4533061393588276e-07, + "logits/chosen": -2.624446153640747, + "logits/rejected": -2.5561485290527344, + "logps/chosen": -252.4219207763672, + "logps/rejected": -670.9520263671875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4106292724609375, + "rewards/margins": 39.53878402709961, + "rewards/rejected": -41.94940948486328, + "step": 128 + }, + { + "epoch": 0.92, + "grad_norm": 26.64416979329211, + "learning_rate": 4.4443618020354947e-07, + "logits/chosen": -2.602850914001465, + "logits/rejected": -2.550997495651245, + "logps/chosen": -268.5458984375, + "logps/rejected": -704.013427734375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0245444774627686, + "rewards/margins": 37.89887619018555, + "rewards/rejected": -39.92341995239258, + "step": 129 + }, + { + "epoch": 0.92, + "grad_norm": 4.993116119304177, + "learning_rate": 4.4353540265977065e-07, + "logits/chosen": -2.582940101623535, + "logits/rejected": -2.529238700866699, + "logps/chosen": -253.8407440185547, + "logps/rejected": -698.4043579101562, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1980721950531006, + "rewards/margins": 40.42285919189453, + "rewards/rejected": -43.62092971801758, + "step": 130 + }, + { + "epoch": 0.93, + "grad_norm": 2.528866200026442, + "learning_rate": 4.426283106939473e-07, + "logits/chosen": -2.564192533493042, + "logits/rejected": -2.5703039169311523, + "logps/chosen": -218.78802490234375, + "logps/rejected": -653.0299072265625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5778160095214844, + "rewards/margins": 42.9122314453125, + "rewards/rejected": -44.49005126953125, + "step": 131 + }, + { + "epoch": 0.94, + "grad_norm": 96.08086075829229, + "learning_rate": 4.417149339014994e-07, + "logits/chosen": -2.5739612579345703, + "logits/rejected": -2.5125672817230225, + "logps/chosen": -240.15184020996094, + "logps/rejected": -621.1173706054688, + "loss": 0.0529, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6627590656280518, + "rewards/margins": 33.45075988769531, + "rewards/rejected": -35.11351776123047, + "step": 132 + }, + { + "epoch": 0.94, + "grad_norm": 2.8796633734164636e-05, + "learning_rate": 4.4079530208290005e-07, + "logits/chosen": -2.567087411880493, + "logits/rejected": -2.544320583343506, + "logps/chosen": -218.01675415039062, + "logps/rejected": -676.8677978515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3365082740783691, + "rewards/margins": 40.27417755126953, + "rewards/rejected": -41.61068344116211, + "step": 133 + }, + { + "epoch": 0.95, + "grad_norm": 0.09372658540539028, + "learning_rate": 4.3986944524270314e-07, + "logits/chosen": -2.600703477859497, + "logits/rejected": -2.58151912689209, + "logps/chosen": -256.88250732421875, + "logps/rejected": -738.823974609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.444084644317627, + "rewards/margins": 43.63507843017578, + "rewards/rejected": -47.079158782958984, + "step": 134 + }, + { + "epoch": 0.96, + "grad_norm": 0.1727634262429737, + "learning_rate": 4.3893739358856455e-07, + "logits/chosen": -2.563455104827881, + "logits/rejected": -2.5491368770599365, + "logps/chosen": -199.9803924560547, + "logps/rejected": -648.3558349609375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7440325021743774, + "rewards/margins": 37.6333122253418, + "rewards/rejected": -39.377342224121094, + "step": 135 + }, + { + "epoch": 0.97, + "grad_norm": 0.00036822927543868534, + "learning_rate": 4.379991775302565e-07, + "logits/chosen": -2.5781753063201904, + "logits/rejected": -2.6460652351379395, + "logps/chosen": -153.51654052734375, + "logps/rejected": -633.5337524414062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8430655598640442, + "rewards/margins": 41.68687057495117, + "rewards/rejected": -42.52993392944336, + "step": 136 + }, + { + "epoch": 0.97, + "grad_norm": 0.02786703845195011, + "learning_rate": 4.370548276786753e-07, + "logits/chosen": -2.5456156730651855, + "logits/rejected": -2.5503318309783936, + "logps/chosen": -251.3705596923828, + "logps/rejected": -703.8067016601562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.119581699371338, + "rewards/margins": 40.984336853027344, + "rewards/rejected": -44.103919982910156, + "step": 137 + }, + { + "epoch": 0.98, + "grad_norm": 0.11152464735865304, + "learning_rate": 4.36104374844843e-07, + "logits/chosen": -2.6397767066955566, + "logits/rejected": -2.611351728439331, + "logps/chosen": -288.53515625, + "logps/rejected": -795.400634765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1739721298217773, + "rewards/margins": 48.789794921875, + "rewards/rejected": -51.96376419067383, + "step": 138 + }, + { + "epoch": 0.99, + "grad_norm": 0.03777465441841697, + "learning_rate": 4.3514785003890143e-07, + "logits/chosen": -2.62467885017395, + "logits/rejected": -2.595163106918335, + "logps/chosen": -260.4969787597656, + "logps/rejected": -726.1373291015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6143627166748047, + "rewards/margins": 45.63804244995117, + "rewards/rejected": -48.252403259277344, + "step": 139 + }, + { + "epoch": 0.99, + "grad_norm": 0.005669493140184591, + "learning_rate": 4.341852844691012e-07, + "logits/chosen": -2.610031843185425, + "logits/rejected": -2.621332883834839, + "logps/chosen": -216.76800537109375, + "logps/rejected": -719.2345581054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4822461605072021, + "rewards/margins": 44.279666900634766, + "rewards/rejected": -45.76191329956055, + "step": 140 + }, + { + "epoch": 1.0, + "grad_norm": 0.00022815768795647143, + "learning_rate": 4.3321670954078293e-07, + "logits/chosen": -2.5568954944610596, + "logits/rejected": -2.540750503540039, + "logps/chosen": -200.95025634765625, + "logps/rejected": -666.9190673828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.238824486732483, + "rewards/margins": 44.067054748535156, + "rewards/rejected": -45.305877685546875, + "step": 141 + }, + { + "epoch": 1.01, + "grad_norm": 1.1539435459732833e-05, + "learning_rate": 4.3224215685535287e-07, + "logits/chosen": -2.583623170852661, + "logits/rejected": -2.60050630569458, + "logps/chosen": -199.88882446289062, + "logps/rejected": -718.0154418945312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3592826128005981, + "rewards/margins": 46.54505920410156, + "rewards/rejected": -47.90433883666992, + "step": 142 + }, + { + "epoch": 1.02, + "grad_norm": 1.6280072864719233e-05, + "learning_rate": 4.312616582092517e-07, + "logits/chosen": -2.597187042236328, + "logits/rejected": -2.5648751258850098, + "logps/chosen": -253.212646484375, + "logps/rejected": -730.0873413085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9380908608436584, + "rewards/margins": 47.646400451660156, + "rewards/rejected": -48.58448791503906, + "step": 143 + }, + { + "epoch": 1.02, + "grad_norm": 2.8426121655878774e-06, + "learning_rate": 4.302752455929173e-07, + "logits/chosen": -2.570369243621826, + "logits/rejected": -2.5662100315093994, + "logps/chosen": -235.5956268310547, + "logps/rejected": -757.3215942382812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.107633590698242, + "rewards/margins": 46.433197021484375, + "rewards/rejected": -48.54083251953125, + "step": 144 + }, + { + "epoch": 1.03, + "grad_norm": 2.90888089475232e-05, + "learning_rate": 4.292829511897409e-07, + "logits/chosen": -2.567291736602783, + "logits/rejected": -2.571481466293335, + "logps/chosen": -189.98292541503906, + "logps/rejected": -718.2835693359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7237987518310547, + "rewards/margins": 49.71271514892578, + "rewards/rejected": -50.4365119934082, + "step": 145 + }, + { + "epoch": 1.04, + "grad_norm": 0.0004662801303105172, + "learning_rate": 4.2828480737501684e-07, + "logits/chosen": -2.6226296424865723, + "logits/rejected": -2.651503801345825, + "logps/chosen": -143.9643096923828, + "logps/rejected": -657.91552734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2402803897857666, + "rewards/margins": 43.428016662597656, + "rewards/rejected": -44.66829299926758, + "step": 146 + }, + { + "epoch": 1.04, + "grad_norm": 5.588200829295209e-05, + "learning_rate": 4.2728084671488665e-07, + "logits/chosen": -2.635173797607422, + "logits/rejected": -2.581272602081299, + "logps/chosen": -265.34490966796875, + "logps/rejected": -742.0103759765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.196270704269409, + "rewards/margins": 46.004974365234375, + "rewards/rejected": -48.20124816894531, + "step": 147 + }, + { + "epoch": 1.05, + "grad_norm": 0.00013017261692089549, + "learning_rate": 4.262711019652764e-07, + "logits/chosen": -2.5791471004486084, + "logits/rejected": -2.6228065490722656, + "logps/chosen": -225.77862548828125, + "logps/rejected": -809.488037109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.141911268234253, + "rewards/margins": 51.57756805419922, + "rewards/rejected": -52.719478607177734, + "step": 148 + }, + { + "epoch": 1.06, + "grad_norm": 0.01473090273235331, + "learning_rate": 4.2525560607082766e-07, + "logits/chosen": -2.573683261871338, + "logits/rejected": -2.5171637535095215, + "logps/chosen": -193.02401733398438, + "logps/rejected": -565.2341918945312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4484729766845703, + "rewards/margins": 34.94745635986328, + "rewards/rejected": -37.395931243896484, + "step": 149 + }, + { + "epoch": 1.07, + "grad_norm": 1.5786956500245529e-06, + "learning_rate": 4.242343921638234e-07, + "logits/chosen": -2.596616744995117, + "logits/rejected": -2.567257881164551, + "logps/chosen": -233.6993865966797, + "logps/rejected": -700.4785766601562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.787569522857666, + "rewards/margins": 44.17692565917969, + "rewards/rejected": -46.96449661254883, + "step": 150 + }, + { + "epoch": 1.07, + "grad_norm": 0.4483012557943752, + "learning_rate": 4.232074935631058e-07, + "logits/chosen": -2.5706634521484375, + "logits/rejected": -2.5774495601654053, + "logps/chosen": -201.2030029296875, + "logps/rejected": -679.729248046875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7499289512634277, + "rewards/margins": 41.2647819519043, + "rewards/rejected": -44.01470947265625, + "step": 151 + }, + { + "epoch": 1.08, + "grad_norm": 0.0003056489547856762, + "learning_rate": 4.221749437729904e-07, + "logits/chosen": -2.575948715209961, + "logits/rejected": -2.5527901649475098, + "logps/chosen": -209.8745880126953, + "logps/rejected": -685.0315551757812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.781831979751587, + "rewards/margins": 42.202980041503906, + "rewards/rejected": -43.98480987548828, + "step": 152 + }, + { + "epoch": 1.09, + "grad_norm": 8.481515068637732e-05, + "learning_rate": 4.2113677648217216e-07, + "logits/chosen": -2.5718917846679688, + "logits/rejected": -2.595346450805664, + "logps/chosen": -196.04486083984375, + "logps/rejected": -672.2835083007812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2190161943435669, + "rewards/margins": 44.05103302001953, + "rewards/rejected": -44.270050048828125, + "step": 153 + }, + { + "epoch": 1.09, + "grad_norm": 0.0001820077784408538, + "learning_rate": 4.2009302556262667e-07, + "logits/chosen": -2.5870463848114014, + "logits/rejected": -2.5880537033081055, + "logps/chosen": -244.90872192382812, + "logps/rejected": -696.75390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.998746395111084, + "rewards/margins": 39.66267776489258, + "rewards/rejected": -43.66142272949219, + "step": 154 + }, + { + "epoch": 1.1, + "grad_norm": 0.00038598988091085825, + "learning_rate": 4.1904372506850483e-07, + "logits/chosen": -2.5988783836364746, + "logits/rejected": -2.6122748851776123, + "logps/chosen": -228.02015686035156, + "logps/rejected": -775.085693359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.350602149963379, + "rewards/margins": 49.67835235595703, + "rewards/rejected": -52.02894973754883, + "step": 155 + }, + { + "epoch": 1.11, + "grad_norm": 0.0008931567378222045, + "learning_rate": 4.1798890923502196e-07, + "logits/chosen": -2.561432123184204, + "logits/rejected": -2.5843517780303955, + "logps/chosen": -190.72703552246094, + "logps/rejected": -697.0485229492188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.451295852661133, + "rewards/margins": 45.76554489135742, + "rewards/rejected": -48.21683883666992, + "step": 156 + }, + { + "epoch": 1.12, + "grad_norm": 2.314461351482945e-06, + "learning_rate": 4.169286124773406e-07, + "logits/chosen": -2.552542209625244, + "logits/rejected": -2.601801872253418, + "logps/chosen": -179.37725830078125, + "logps/rejected": -698.3157958984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21464112401008606, + "rewards/margins": 49.06697463989258, + "rewards/rejected": -48.852333068847656, + "step": 157 + }, + { + "epoch": 1.12, + "grad_norm": 0.0012384481275486917, + "learning_rate": 4.158628693894479e-07, + "logits/chosen": -2.6161797046661377, + "logits/rejected": -2.582975149154663, + "logps/chosen": -259.7201232910156, + "logps/rejected": -790.1285400390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5851057767868042, + "rewards/margins": 53.555458068847656, + "rewards/rejected": -55.140567779541016, + "step": 158 + }, + { + "epoch": 1.13, + "grad_norm": 2.4609103703280878e-06, + "learning_rate": 4.147917147430267e-07, + "logits/chosen": -2.6165831089019775, + "logits/rejected": -2.5431764125823975, + "logps/chosen": -211.67347717285156, + "logps/rejected": -623.53662109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.21859872341156, + "rewards/margins": 37.740482330322266, + "rewards/rejected": -38.95907974243164, + "step": 159 + }, + { + "epoch": 1.14, + "grad_norm": 3.938661615116637e-05, + "learning_rate": 4.137151834863213e-07, + "logits/chosen": -2.6515235900878906, + "logits/rejected": -2.5725584030151367, + "logps/chosen": -253.0316619873047, + "logps/rejected": -694.6533813476562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4212515354156494, + "rewards/margins": 39.18084716796875, + "rewards/rejected": -40.60210037231445, + "step": 160 + }, + { + "epoch": 1.14, + "grad_norm": 0.2732381903820897, + "learning_rate": 4.126333107429967e-07, + "logits/chosen": -2.6154708862304688, + "logits/rejected": -2.5370631217956543, + "logps/chosen": -208.90463256835938, + "logps/rejected": -667.1223754882812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0636826753616333, + "rewards/margins": 45.90496826171875, + "rewards/rejected": -45.841285705566406, + "step": 161 + }, + { + "epoch": 1.15, + "grad_norm": 0.0004167685667905231, + "learning_rate": 4.115461318109935e-07, + "logits/chosen": -2.564563274383545, + "logits/rejected": -2.5531859397888184, + "logps/chosen": -214.35052490234375, + "logps/rejected": -778.1734619140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6587705612182617, + "rewards/margins": 48.74113464355469, + "rewards/rejected": -51.39990234375, + "step": 162 + }, + { + "epoch": 1.16, + "grad_norm": 0.016492792863443486, + "learning_rate": 4.1045368216137544e-07, + "logits/chosen": -2.664215326309204, + "logits/rejected": -2.621072769165039, + "logps/chosen": -216.53549194335938, + "logps/rejected": -635.9772338867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3723301887512207, + "rewards/margins": 38.74686050415039, + "rewards/rejected": -41.11918640136719, + "step": 163 + }, + { + "epoch": 1.17, + "grad_norm": 0.00017774534192921193, + "learning_rate": 4.0935599743717244e-07, + "logits/chosen": -2.587266683578491, + "logits/rejected": -2.576569080352783, + "logps/chosen": -230.09112548828125, + "logps/rejected": -710.7093505859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.168027400970459, + "rewards/margins": 45.01225280761719, + "rewards/rejected": -47.18028259277344, + "step": 164 + }, + { + "epoch": 1.17, + "grad_norm": 0.00015615151720778332, + "learning_rate": 4.082531134522176e-07, + "logits/chosen": -2.6086883544921875, + "logits/rejected": -2.6074631214141846, + "logps/chosen": -204.37579345703125, + "logps/rejected": -715.26025390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1054762601852417, + "rewards/margins": 44.97893524169922, + "rewards/rejected": -46.08441162109375, + "step": 165 + }, + { + "epoch": 1.18, + "grad_norm": 0.004770568419420543, + "learning_rate": 4.0714506618997883e-07, + "logits/chosen": -2.6288251876831055, + "logits/rejected": -2.619809150695801, + "logps/chosen": -213.45382690429688, + "logps/rejected": -591.2918090820312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8026537895202637, + "rewards/margins": 36.34064865112305, + "rewards/rejected": -39.14330291748047, + "step": 166 + }, + { + "epoch": 1.19, + "grad_norm": 0.002046089717361918, + "learning_rate": 4.0603189180238486e-07, + "logits/chosen": -2.6056103706359863, + "logits/rejected": -2.569298028945923, + "logps/chosen": -273.2747802734375, + "logps/rejected": -730.37646484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.155305862426758, + "rewards/margins": 44.866031646728516, + "rewards/rejected": -47.021339416503906, + "step": 167 + }, + { + "epoch": 1.19, + "grad_norm": 0.009187113470484643, + "learning_rate": 4.0491362660864523e-07, + "logits/chosen": -2.6161460876464844, + "logits/rejected": -2.5576770305633545, + "logps/chosen": -190.67861938476562, + "logps/rejected": -653.6278076171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3470852375030518, + "rewards/margins": 44.159393310546875, + "rewards/rejected": -46.5064811706543, + "step": 168 + }, + { + "epoch": 1.2, + "grad_norm": 0.00022681304100481196, + "learning_rate": 4.0379030709406623e-07, + "logits/chosen": -2.574169397354126, + "logits/rejected": -2.6030027866363525, + "logps/chosen": -182.76925659179688, + "logps/rejected": -678.37109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7692532539367676, + "rewards/margins": 43.3445930480957, + "rewards/rejected": -45.11384582519531, + "step": 169 + }, + { + "epoch": 1.21, + "grad_norm": 0.0004674740795912675, + "learning_rate": 4.0266196990885955e-07, + "logits/chosen": -2.6516175270080566, + "logits/rejected": -2.598280906677246, + "logps/chosen": -250.63906860351562, + "logps/rejected": -731.0950927734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.556765079498291, + "rewards/margins": 44.50921630859375, + "rewards/rejected": -47.065982818603516, + "step": 170 + }, + { + "epoch": 1.21, + "grad_norm": 0.001699479334776146, + "learning_rate": 4.0152865186694706e-07, + "logits/chosen": -2.571397066116333, + "logits/rejected": -2.548680543899536, + "logps/chosen": -258.6770324707031, + "logps/rejected": -731.6434326171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7200090885162354, + "rewards/margins": 43.29862594604492, + "rewards/rejected": -45.01863479614258, + "step": 171 + }, + { + "epoch": 1.22, + "grad_norm": 0.002061409386094836, + "learning_rate": 4.0039038994475967e-07, + "logits/chosen": -2.58412766456604, + "logits/rejected": -2.4661545753479004, + "logps/chosen": -231.63372802734375, + "logps/rejected": -606.4661865234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.567789077758789, + "rewards/margins": 36.220664978027344, + "rewards/rejected": -38.7884521484375, + "step": 172 + }, + { + "epoch": 1.23, + "grad_norm": 4.585800977645965e-05, + "learning_rate": 3.9924722128003064e-07, + "logits/chosen": -2.6837406158447266, + "logits/rejected": -2.5585079193115234, + "logps/chosen": -285.98602294921875, + "logps/rejected": -727.728759765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.999790072441101, + "rewards/margins": 47.17414093017578, + "rewards/rejected": -49.17393493652344, + "step": 173 + }, + { + "epoch": 1.24, + "grad_norm": 8.589266392645064e-05, + "learning_rate": 3.980991831705842e-07, + "logits/chosen": -2.571767807006836, + "logits/rejected": -2.582883358001709, + "logps/chosen": -219.5039520263672, + "logps/rejected": -718.7467651367188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6085176467895508, + "rewards/margins": 44.41033935546875, + "rewards/rejected": -46.01885986328125, + "step": 174 + }, + { + "epoch": 1.24, + "grad_norm": 4.3088882230925214e-05, + "learning_rate": 3.9694631307311825e-07, + "logits/chosen": -2.5392234325408936, + "logits/rejected": -2.5425832271575928, + "logps/chosen": -227.6947021484375, + "logps/rejected": -700.3865966796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.487729787826538, + "rewards/margins": 42.523345947265625, + "rewards/rejected": -45.011077880859375, + "step": 175 + }, + { + "epoch": 1.25, + "grad_norm": 0.0006816807836120795, + "learning_rate": 3.9578864860198297e-07, + "logits/chosen": -2.6063621044158936, + "logits/rejected": -2.589238405227661, + "logps/chosen": -247.16705322265625, + "logps/rejected": -737.020751953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.238234758377075, + "rewards/margins": 43.935325622558594, + "rewards/rejected": -47.173561096191406, + "step": 176 + }, + { + "epoch": 1.26, + "grad_norm": 0.03519763376705599, + "learning_rate": 3.9462622752795273e-07, + "logits/chosen": -2.5918827056884766, + "logits/rejected": -2.640364646911621, + "logps/chosen": -218.81558227539062, + "logps/rejected": -705.66259765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3849472999572754, + "rewards/margins": 41.51541519165039, + "rewards/rejected": -44.90036392211914, + "step": 177 + }, + { + "epoch": 1.26, + "grad_norm": 5.3102905782567814e-05, + "learning_rate": 3.9345908777699433e-07, + "logits/chosen": -2.648378610610962, + "logits/rejected": -2.5747437477111816, + "logps/chosen": -260.0047302246094, + "logps/rejected": -680.903076171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4895734786987305, + "rewards/margins": 45.286949157714844, + "rewards/rejected": -47.776519775390625, + "step": 178 + }, + { + "epoch": 1.27, + "grad_norm": 0.0004104546156054008, + "learning_rate": 3.922872674290295e-07, + "logits/chosen": -2.602609157562256, + "logits/rejected": -2.5364885330200195, + "logps/chosen": -190.64035034179688, + "logps/rejected": -616.9769287109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.219014883041382, + "rewards/margins": 37.46174621582031, + "rewards/rejected": -39.680763244628906, + "step": 179 + }, + { + "epoch": 1.28, + "grad_norm": 0.0008331830103905508, + "learning_rate": 3.9111080471669233e-07, + "logits/chosen": -2.6162502765655518, + "logits/rejected": -2.5387635231018066, + "logps/chosen": -242.56565856933594, + "logps/rejected": -742.8907470703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1962075233459473, + "rewards/margins": 45.26032638549805, + "rewards/rejected": -48.45653533935547, + "step": 180 + }, + { + "epoch": 1.29, + "grad_norm": 7.374346407645852e-05, + "learning_rate": 3.8992973802408184e-07, + "logits/chosen": -2.569359302520752, + "logits/rejected": -2.5742392539978027, + "logps/chosen": -214.93621826171875, + "logps/rejected": -727.053955078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7737250328063965, + "rewards/margins": 48.266357421875, + "rewards/rejected": -52.04008102416992, + "step": 181 + }, + { + "epoch": 1.29, + "grad_norm": 0.008639194062017252, + "learning_rate": 3.887441058855101e-07, + "logits/chosen": -2.5811495780944824, + "logits/rejected": -2.542731761932373, + "logps/chosen": -183.28643798828125, + "logps/rejected": -658.311767578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.03851318359375, + "rewards/margins": 44.81935119628906, + "rewards/rejected": -46.85786437988281, + "step": 182 + }, + { + "epoch": 1.3, + "grad_norm": 1.9626277330936126e-06, + "learning_rate": 3.8755394698424426e-07, + "logits/chosen": -2.568392276763916, + "logits/rejected": -2.5274596214294434, + "logps/chosen": -210.89170837402344, + "logps/rejected": -621.986328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.1594672203063965, + "rewards/margins": 33.15226364135742, + "rewards/rejected": -37.311729431152344, + "step": 183 + }, + { + "epoch": 1.31, + "grad_norm": 1.3327751424425554e-05, + "learning_rate": 3.86359300151245e-07, + "logits/chosen": -2.6583023071289062, + "logits/rejected": -2.575366735458374, + "logps/chosen": -301.7183837890625, + "logps/rejected": -715.0674438476562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.005529880523682, + "rewards/margins": 36.15945053100586, + "rewards/rejected": -40.16498565673828, + "step": 184 + }, + { + "epoch": 1.31, + "grad_norm": 0.0015136697168244824, + "learning_rate": 3.851602043638994e-07, + "logits/chosen": -2.676177978515625, + "logits/rejected": -2.6057064533233643, + "logps/chosen": -321.2886962890625, + "logps/rejected": -789.05517578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1413521766662598, + "rewards/margins": 47.196720123291016, + "rewards/rejected": -49.338069915771484, + "step": 185 + }, + { + "epoch": 1.32, + "grad_norm": 0.004731883171088637, + "learning_rate": 3.839566987447491e-07, + "logits/chosen": -2.5592198371887207, + "logits/rejected": -2.5781192779541016, + "logps/chosen": -194.6182403564453, + "logps/rejected": -719.5448608398438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9351108074188232, + "rewards/margins": 44.968292236328125, + "rewards/rejected": -46.903404235839844, + "step": 186 + }, + { + "epoch": 1.33, + "grad_norm": 0.00013205370140313423, + "learning_rate": 3.8274882256021433e-07, + "logits/chosen": -2.5751876831054688, + "logits/rejected": -2.587034225463867, + "logps/chosen": -222.43740844726562, + "logps/rejected": -640.2822265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4436620473861694, + "rewards/margins": 40.89015197753906, + "rewards/rejected": -42.33381652832031, + "step": 187 + }, + { + "epoch": 1.34, + "grad_norm": 4.347159707384699e-05, + "learning_rate": 3.8153661521931215e-07, + "logits/chosen": -2.605191707611084, + "logits/rejected": -2.563929796218872, + "logps/chosen": -281.89654541015625, + "logps/rejected": -763.271728515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9085638523101807, + "rewards/margins": 45.944114685058594, + "rewards/rejected": -49.85268020629883, + "step": 188 + }, + { + "epoch": 1.34, + "grad_norm": 0.007198807585566107, + "learning_rate": 3.8032011627237097e-07, + "logits/chosen": -2.626549243927002, + "logits/rejected": -2.5736947059631348, + "logps/chosen": -286.00433349609375, + "logps/rejected": -821.015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2450716495513916, + "rewards/margins": 51.89350128173828, + "rewards/rejected": -55.138572692871094, + "step": 189 + }, + { + "epoch": 1.35, + "grad_norm": 0.0012887457644015477, + "learning_rate": 3.7909936540974046e-07, + "logits/chosen": -2.541388988494873, + "logits/rejected": -2.5730860233306885, + "logps/chosen": -164.2898712158203, + "logps/rejected": -646.9137573242188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6139147281646729, + "rewards/margins": 41.77546691894531, + "rewards/rejected": -43.389381408691406, + "step": 190 + }, + { + "epoch": 1.36, + "grad_norm": 0.009778285901878379, + "learning_rate": 3.77874402460496e-07, + "logits/chosen": -2.5973589420318604, + "logits/rejected": -2.549447536468506, + "logps/chosen": -260.55322265625, + "logps/rejected": -676.940185546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.399229526519775, + "rewards/margins": 36.91831970214844, + "rewards/rejected": -41.31755065917969, + "step": 191 + }, + { + "epoch": 1.36, + "grad_norm": 0.0008509337634688795, + "learning_rate": 3.7664526739113955e-07, + "logits/chosen": -2.6228926181793213, + "logits/rejected": -2.5585670471191406, + "logps/chosen": -239.9962158203125, + "logps/rejected": -656.2476806640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.590725898742676, + "rewards/margins": 39.48649597167969, + "rewards/rejected": -42.07722473144531, + "step": 192 + }, + { + "epoch": 1.37, + "grad_norm": 5.2809397657515845e-05, + "learning_rate": 3.7541200030429563e-07, + "logits/chosen": -2.6130588054656982, + "logits/rejected": -2.584519386291504, + "logps/chosen": -219.84759521484375, + "logps/rejected": -667.1642456054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9688729047775269, + "rewards/margins": 44.62300491333008, + "rewards/rejected": -45.59187316894531, + "step": 193 + }, + { + "epoch": 1.38, + "grad_norm": 0.0005052904326579023, + "learning_rate": 3.741746414374028e-07, + "logits/chosen": -2.5732181072235107, + "logits/rejected": -2.5817766189575195, + "logps/chosen": -245.62635803222656, + "logps/rejected": -689.6527099609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3551692962646484, + "rewards/margins": 39.688385009765625, + "rewards/rejected": -43.043556213378906, + "step": 194 + }, + { + "epoch": 1.39, + "grad_norm": 1.7955003506877755e-06, + "learning_rate": 3.72933231161401e-07, + "logits/chosen": -2.5767765045166016, + "logits/rejected": -2.526268243789673, + "logps/chosen": -247.6514892578125, + "logps/rejected": -704.3443603515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.938252329826355, + "rewards/margins": 44.26829147338867, + "rewards/rejected": -46.20654296875, + "step": 195 + }, + { + "epoch": 1.39, + "grad_norm": 0.000857666222864142, + "learning_rate": 3.716878099794141e-07, + "logits/chosen": -2.5939157009124756, + "logits/rejected": -2.518718957901001, + "logps/chosen": -210.98370361328125, + "logps/rejected": -637.1238403320312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6101771593093872, + "rewards/margins": 43.85637664794922, + "rewards/rejected": -45.466552734375, + "step": 196 + }, + { + "epoch": 1.4, + "grad_norm": 9.986946359195026e-06, + "learning_rate": 3.704384185254288e-07, + "logits/chosen": -2.5751476287841797, + "logits/rejected": -2.601004123687744, + "logps/chosen": -168.38536071777344, + "logps/rejected": -630.9132080078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.283494472503662, + "rewards/margins": 37.38148498535156, + "rewards/rejected": -39.664981842041016, + "step": 197 + }, + { + "epoch": 1.41, + "grad_norm": 0.0007390563720859438, + "learning_rate": 3.6918509756296874e-07, + "logits/chosen": -2.5324323177337646, + "logits/rejected": -2.575556755065918, + "logps/chosen": -228.33265686035156, + "logps/rejected": -766.3194580078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23001301288604736, + "rewards/margins": 49.787471771240234, + "rewards/rejected": -50.017486572265625, + "step": 198 + }, + { + "epoch": 1.41, + "grad_norm": 0.000288712678118981, + "learning_rate": 3.679278879837642e-07, + "logits/chosen": -2.5812995433807373, + "logits/rejected": -2.6321942806243896, + "logps/chosen": -217.1759033203125, + "logps/rejected": -695.9140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.520460605621338, + "rewards/margins": 40.43023681640625, + "rewards/rejected": -45.95069885253906, + "step": 199 + }, + { + "epoch": 1.42, + "grad_norm": 0.0012231022713329845, + "learning_rate": 3.6666683080641843e-07, + "logits/chosen": -2.58329439163208, + "logits/rejected": -2.5853867530822754, + "logps/chosen": -215.34913635253906, + "logps/rejected": -611.637451171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.439545154571533, + "rewards/margins": 34.227867126464844, + "rewards/rejected": -36.66741180419922, + "step": 200 + }, + { + "epoch": 1.43, + "grad_norm": 0.000733759350414321, + "learning_rate": 3.6540196717506897e-07, + "logits/chosen": -2.6083312034606934, + "logits/rejected": -2.585056781768799, + "logps/chosen": -248.43228149414062, + "logps/rejected": -672.314697265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.367678165435791, + "rewards/margins": 39.240882873535156, + "rewards/rejected": -41.608558654785156, + "step": 201 + }, + { + "epoch": 1.44, + "grad_norm": 8.422228178581671e-05, + "learning_rate": 3.641333383580456e-07, + "logits/chosen": -2.5543086528778076, + "logits/rejected": -2.5537734031677246, + "logps/chosen": -226.57147216796875, + "logps/rejected": -702.8181762695312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0970239639282227, + "rewards/margins": 45.87911605834961, + "rewards/rejected": -46.976139068603516, + "step": 202 + }, + { + "epoch": 1.44, + "grad_norm": 0.002539476763684861, + "learning_rate": 3.628609857465235e-07, + "logits/chosen": -2.6343159675598145, + "logits/rejected": -2.5662364959716797, + "logps/chosen": -276.103271484375, + "logps/rejected": -709.9378662109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4413604736328125, + "rewards/margins": 34.88125991821289, + "rewards/rejected": -38.32261657714844, + "step": 203 + }, + { + "epoch": 1.45, + "grad_norm": 0.000880276924259783, + "learning_rate": 3.61584950853173e-07, + "logits/chosen": -2.6310958862304688, + "logits/rejected": -2.5548181533813477, + "logps/chosen": -239.91468811035156, + "logps/rejected": -634.9144287109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.289881706237793, + "rewards/margins": 39.551979064941406, + "rewards/rejected": -42.84186553955078, + "step": 204 + }, + { + "epoch": 1.46, + "grad_norm": 3.657139677374002e-05, + "learning_rate": 3.603052753108053e-07, + "logits/chosen": -2.6221022605895996, + "logits/rejected": -2.631523609161377, + "logps/chosen": -207.55445861816406, + "logps/rejected": -802.033203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.490554690361023, + "rewards/margins": 54.03175735473633, + "rewards/rejected": -55.52231216430664, + "step": 205 + }, + { + "epoch": 1.46, + "grad_norm": 2.0119881146273305e-05, + "learning_rate": 3.590220008710138e-07, + "logits/chosen": -2.5629162788391113, + "logits/rejected": -2.5257961750030518, + "logps/chosen": -236.7938995361328, + "logps/rejected": -738.3856201171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4539563655853271, + "rewards/margins": 45.72328567504883, + "rewards/rejected": -47.17724609375, + "step": 206 + }, + { + "epoch": 1.47, + "grad_norm": 7.981162937757085e-05, + "learning_rate": 3.577351694028122e-07, + "logits/chosen": -2.591754674911499, + "logits/rejected": -2.526224374771118, + "logps/chosen": -198.39166259765625, + "logps/rejected": -641.675048828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.994654893875122, + "rewards/margins": 42.713523864746094, + "rewards/rejected": -44.70817947387695, + "step": 207 + }, + { + "epoch": 1.48, + "grad_norm": 0.0005271066555757389, + "learning_rate": 3.5644482289126813e-07, + "logits/chosen": -2.564438581466675, + "logits/rejected": -2.5452938079833984, + "logps/chosen": -252.55999755859375, + "logps/rejected": -691.1210327148438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9640474319458008, + "rewards/margins": 39.98043441772461, + "rewards/rejected": -41.944480895996094, + "step": 208 + }, + { + "epoch": 1.48, + "grad_norm": 0.00020356656142167604, + "learning_rate": 3.551510034361337e-07, + "logits/chosen": -2.6246352195739746, + "logits/rejected": -2.572950839996338, + "logps/chosen": -209.00924682617188, + "logps/rejected": -667.8258666992188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7787586450576782, + "rewards/margins": 41.19306182861328, + "rewards/rejected": -42.97181701660156, + "step": 209 + }, + { + "epoch": 1.49, + "grad_norm": 0.000350105947478326, + "learning_rate": 3.5385375325047163e-07, + "logits/chosen": -2.6300346851348877, + "logits/rejected": -2.5822620391845703, + "logps/chosen": -238.00901794433594, + "logps/rejected": -673.64501953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7844257354736328, + "rewards/margins": 41.155094146728516, + "rewards/rejected": -42.939517974853516, + "step": 210 + }, + { + "epoch": 1.5, + "grad_norm": 4.6067886192834765e-07, + "learning_rate": 3.5255311465927797e-07, + "logits/chosen": -2.616466760635376, + "logits/rejected": -2.589303970336914, + "logps/chosen": -207.48519897460938, + "logps/rejected": -704.46875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1980453729629517, + "rewards/margins": 47.832462310791016, + "rewards/rejected": -49.03050994873047, + "step": 211 + }, + { + "epoch": 1.51, + "grad_norm": 7.578543212995715e-07, + "learning_rate": 3.512491300981013e-07, + "logits/chosen": -2.633970260620117, + "logits/rejected": -2.5496675968170166, + "logps/chosen": -253.0145263671875, + "logps/rejected": -636.4385375976562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0515022277832031, + "rewards/margins": 42.21809387207031, + "rewards/rejected": -43.269596099853516, + "step": 212 + }, + { + "epoch": 1.51, + "grad_norm": 1.2262496979898977e-06, + "learning_rate": 3.4994184211165846e-07, + "logits/chosen": -2.634376287460327, + "logits/rejected": -2.5898663997650146, + "logps/chosen": -239.5780487060547, + "logps/rejected": -720.5720825195312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5971877574920654, + "rewards/margins": 45.90214538574219, + "rewards/rejected": -49.49933624267578, + "step": 213 + }, + { + "epoch": 1.52, + "grad_norm": 8.399516678718336e-05, + "learning_rate": 3.486312933524457e-07, + "logits/chosen": -2.596825122833252, + "logits/rejected": -2.6391563415527344, + "logps/chosen": -184.0288543701172, + "logps/rejected": -750.6767578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.778039813041687, + "rewards/margins": 49.85633087158203, + "rewards/rejected": -50.634368896484375, + "step": 214 + }, + { + "epoch": 1.53, + "grad_norm": 0.00028018477243698894, + "learning_rate": 3.4731752657934787e-07, + "logits/chosen": -2.6160709857940674, + "logits/rejected": -2.5621070861816406, + "logps/chosen": -215.84976196289062, + "logps/rejected": -679.6697998046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6129071712493896, + "rewards/margins": 43.58246994018555, + "rewards/rejected": -45.19538116455078, + "step": 215 + }, + { + "epoch": 1.53, + "grad_norm": 5.977015780789322e-06, + "learning_rate": 3.460005846562428e-07, + "logits/chosen": -2.605872869491577, + "logits/rejected": -2.546614408493042, + "logps/chosen": -195.818115234375, + "logps/rejected": -586.410400390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2920961380004883, + "rewards/margins": 38.032108306884766, + "rewards/rejected": -41.32420349121094, + "step": 216 + }, + { + "epoch": 1.54, + "grad_norm": 3.153007003218233e-05, + "learning_rate": 3.446805105506033e-07, + "logits/chosen": -2.61542010307312, + "logits/rejected": -2.5457875728607178, + "logps/chosen": -221.92608642578125, + "logps/rejected": -705.8463134765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7586467266082764, + "rewards/margins": 45.22154998779297, + "rewards/rejected": -46.98019790649414, + "step": 217 + }, + { + "epoch": 1.55, + "grad_norm": 0.0022851106931792886, + "learning_rate": 3.4335734733209455e-07, + "logits/chosen": -2.5730552673339844, + "logits/rejected": -2.5667812824249268, + "logps/chosen": -223.39761352539062, + "logps/rejected": -705.6068115234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8217731714248657, + "rewards/margins": 43.927345275878906, + "rewards/rejected": -45.749114990234375, + "step": 218 + }, + { + "epoch": 1.56, + "grad_norm": 0.006519475826514208, + "learning_rate": 3.4203113817116953e-07, + "logits/chosen": -2.589329242706299, + "logits/rejected": -2.6015141010284424, + "logps/chosen": -160.32980346679688, + "logps/rejected": -569.9908447265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.078522205352783, + "rewards/margins": 37.706504821777344, + "rewards/rejected": -39.78502655029297, + "step": 219 + }, + { + "epoch": 1.56, + "grad_norm": 0.006288728421870319, + "learning_rate": 3.407019263376602e-07, + "logits/chosen": -2.5994086265563965, + "logits/rejected": -2.5762906074523926, + "logps/chosen": -214.80221557617188, + "logps/rejected": -699.91650390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3636231422424316, + "rewards/margins": 43.48625564575195, + "rewards/rejected": -46.84988021850586, + "step": 220 + }, + { + "epoch": 1.57, + "grad_norm": 1.8209229370515527, + "learning_rate": 3.393697551993661e-07, + "logits/chosen": -2.646888256072998, + "logits/rejected": -2.579021453857422, + "logps/chosen": -239.33230590820312, + "logps/rejected": -762.656005859375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8032903671264648, + "rewards/margins": 54.07151794433594, + "rewards/rejected": -55.87480926513672, + "step": 221 + }, + { + "epoch": 1.58, + "grad_norm": 0.0009005144690069314, + "learning_rate": 3.3803466822063875e-07, + "logits/chosen": -2.5617916584014893, + "logits/rejected": -2.589597702026367, + "logps/chosen": -221.90574645996094, + "logps/rejected": -697.5983276367188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7422846555709839, + "rewards/margins": 45.80995559692383, + "rewards/rejected": -47.552242279052734, + "step": 222 + }, + { + "epoch": 1.58, + "grad_norm": 0.03220941788758961, + "learning_rate": 3.3669670896096403e-07, + "logits/chosen": -2.608153820037842, + "logits/rejected": -2.6688108444213867, + "logps/chosen": -200.77297973632812, + "logps/rejected": -661.3016357421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3855538368225098, + "rewards/margins": 40.689208984375, + "rewards/rejected": -44.07476806640625, + "step": 223 + }, + { + "epoch": 1.59, + "grad_norm": 0.00016402611228167382, + "learning_rate": 3.35355921073541e-07, + "logits/chosen": -2.638033151626587, + "logits/rejected": -2.608006715774536, + "logps/chosen": -225.67538452148438, + "logps/rejected": -707.5538330078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8232722282409668, + "rewards/margins": 45.372047424316406, + "rewards/rejected": -47.19532012939453, + "step": 224 + }, + { + "epoch": 1.6, + "grad_norm": 0.010001036289766002, + "learning_rate": 3.340123483038575e-07, + "logits/chosen": -2.5980443954467773, + "logits/rejected": -2.6389718055725098, + "logps/chosen": -211.3531494140625, + "logps/rejected": -733.5176391601562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8272793292999268, + "rewards/margins": 46.7315673828125, + "rewards/rejected": -49.558841705322266, + "step": 225 + }, + { + "epoch": 1.61, + "grad_norm": 0.00019735494555091413, + "learning_rate": 3.326660344882628e-07, + "logits/chosen": -2.603639602661133, + "logits/rejected": -2.627902030944824, + "logps/chosen": -240.2572021484375, + "logps/rejected": -746.4942016601562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6755309104919434, + "rewards/margins": 46.23228073120117, + "rewards/rejected": -49.907814025878906, + "step": 226 + }, + { + "epoch": 1.61, + "grad_norm": 0.34730438950674475, + "learning_rate": 3.313170235525374e-07, + "logits/chosen": -2.6512913703918457, + "logits/rejected": -2.5727484226226807, + "logps/chosen": -239.92086791992188, + "logps/rejected": -666.4832763671875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.958973407745361, + "rewards/margins": 43.2906494140625, + "rewards/rejected": -48.24962615966797, + "step": 227 + }, + { + "epoch": 1.62, + "grad_norm": 0.0001011366460316848, + "learning_rate": 3.299653595104602e-07, + "logits/chosen": -2.646182060241699, + "logits/rejected": -2.627742290496826, + "logps/chosen": -201.85833740234375, + "logps/rejected": -740.8365478515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9788103103637695, + "rewards/margins": 49.82157516479492, + "rewards/rejected": -51.800384521484375, + "step": 228 + }, + { + "epoch": 1.63, + "grad_norm": 4.333899880039296e-05, + "learning_rate": 3.28611086462372e-07, + "logits/chosen": -2.619448661804199, + "logits/rejected": -2.6020002365112305, + "logps/chosen": -213.89462280273438, + "logps/rejected": -755.117919921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.423417806625366, + "rewards/margins": 49.50520324707031, + "rewards/rejected": -52.928627014160156, + "step": 229 + }, + { + "epoch": 1.63, + "grad_norm": 1.1133991602376578e-05, + "learning_rate": 3.272542485937368e-07, + "logits/chosen": -2.6537086963653564, + "logits/rejected": -2.5845842361450195, + "logps/chosen": -241.1490020751953, + "logps/rejected": -744.0867919921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.898337364196777, + "rewards/margins": 45.04263687133789, + "rewards/rejected": -49.94097137451172, + "step": 230 + }, + { + "epoch": 1.64, + "grad_norm": 0.0004944144634150385, + "learning_rate": 3.2589489017370054e-07, + "logits/chosen": -2.663158893585205, + "logits/rejected": -2.572969913482666, + "logps/chosen": -234.42543029785156, + "logps/rejected": -680.1776733398438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.120585918426514, + "rewards/margins": 41.46262741088867, + "rewards/rejected": -45.583213806152344, + "step": 231 + }, + { + "epoch": 1.65, + "grad_norm": 0.06129388419000637, + "learning_rate": 3.245330555536461e-07, + "logits/chosen": -2.6757800579071045, + "logits/rejected": -2.6678802967071533, + "logps/chosen": -268.8004150390625, + "logps/rejected": -816.198974609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3049869537353516, + "rewards/margins": 49.955360412597656, + "rewards/rejected": -52.260353088378906, + "step": 232 + }, + { + "epoch": 1.66, + "grad_norm": 0.013546860788437833, + "learning_rate": 3.231687891657469e-07, + "logits/chosen": -2.591245651245117, + "logits/rejected": -2.5924582481384277, + "logps/chosen": -250.55459594726562, + "logps/rejected": -713.7130126953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6036484241485596, + "rewards/margins": 45.84377670288086, + "rewards/rejected": -49.44742202758789, + "step": 233 + }, + { + "epoch": 1.66, + "grad_norm": 0.06126616571887995, + "learning_rate": 3.218021355215166e-07, + "logits/chosen": -2.588942766189575, + "logits/rejected": -2.6237874031066895, + "logps/chosen": -192.82749938964844, + "logps/rejected": -605.7573852539062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.212188243865967, + "rewards/margins": 38.424991607666016, + "rewards/rejected": -43.637176513671875, + "step": 234 + }, + { + "epoch": 1.67, + "grad_norm": 0.0019625399126593657, + "learning_rate": 3.204331392103574e-07, + "logits/chosen": -2.6118078231811523, + "logits/rejected": -2.6174445152282715, + "logps/chosen": -276.7481994628906, + "logps/rejected": -849.659912109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.768052101135254, + "rewards/margins": 55.55689239501953, + "rewards/rejected": -60.32494354248047, + "step": 235 + }, + { + "epoch": 1.68, + "grad_norm": 5.002571129095978e-06, + "learning_rate": 3.190618448981051e-07, + "logits/chosen": -2.702798366546631, + "logits/rejected": -2.6287076473236084, + "logps/chosen": -254.6178436279297, + "logps/rejected": -802.20361328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.161823749542236, + "rewards/margins": 52.920555114746094, + "rewards/rejected": -57.08237838745117, + "step": 236 + }, + { + "epoch": 1.68, + "grad_norm": 7.12052632435535e-05, + "learning_rate": 3.1768829732557135e-07, + "logits/chosen": -2.6606497764587402, + "logits/rejected": -2.6363344192504883, + "logps/chosen": -227.60289001464844, + "logps/rejected": -806.2652587890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3978519439697266, + "rewards/margins": 54.34749984741211, + "rewards/rejected": -56.74535369873047, + "step": 237 + }, + { + "epoch": 1.69, + "grad_norm": 0.0010937965577921605, + "learning_rate": 3.163125413070844e-07, + "logits/chosen": -2.5996205806732178, + "logits/rejected": -2.5114541053771973, + "logps/chosen": -215.44503784179688, + "logps/rejected": -651.8919677734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8539767265319824, + "rewards/margins": 41.235069274902344, + "rewards/rejected": -43.08904266357422, + "step": 238 + }, + { + "epoch": 1.7, + "grad_norm": 1.3155577633479938e-05, + "learning_rate": 3.149346217290271e-07, + "logits/chosen": -2.621655225753784, + "logits/rejected": -2.588082790374756, + "logps/chosen": -259.9637145996094, + "logps/rejected": -720.2318115234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.471480369567871, + "rewards/margins": 45.632911682128906, + "rewards/rejected": -50.104393005371094, + "step": 239 + }, + { + "epoch": 1.71, + "grad_norm": 0.00610941607313684, + "learning_rate": 3.135545835483718e-07, + "logits/chosen": -2.6089236736297607, + "logits/rejected": -2.587430715560913, + "logps/chosen": -255.17062377929688, + "logps/rejected": -766.4443969726562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.38810396194458, + "rewards/margins": 45.594322204589844, + "rewards/rejected": -50.982425689697266, + "step": 240 + }, + { + "epoch": 1.71, + "grad_norm": 0.0007781666817554977, + "learning_rate": 3.121724717912138e-07, + "logits/chosen": -2.67773699760437, + "logits/rejected": -2.6346096992492676, + "logps/chosen": -254.72076416015625, + "logps/rejected": -805.816650390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.627288341522217, + "rewards/margins": 53.54477310180664, + "rewards/rejected": -56.172061920166016, + "step": 241 + }, + { + "epoch": 1.72, + "grad_norm": 4.37272231895179e-06, + "learning_rate": 3.1078833155130243e-07, + "logits/chosen": -2.6342856884002686, + "logits/rejected": -2.5860724449157715, + "logps/chosen": -279.3261413574219, + "logps/rejected": -802.7673950195312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.1220903396606445, + "rewards/margins": 48.382057189941406, + "rewards/rejected": -52.504150390625, + "step": 242 + }, + { + "epoch": 1.73, + "grad_norm": 5.3337321744518534e-06, + "learning_rate": 3.0940220798857e-07, + "logits/chosen": -2.6475141048431396, + "logits/rejected": -2.6263465881347656, + "logps/chosen": -250.61160278320312, + "logps/rejected": -771.9094848632812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.81309175491333, + "rewards/margins": 48.76592254638672, + "rewards/rejected": -53.57900619506836, + "step": 243 + }, + { + "epoch": 1.73, + "grad_norm": 0.006141050550734622, + "learning_rate": 3.080141463276579e-07, + "logits/chosen": -2.5699822902679443, + "logits/rejected": -2.573237419128418, + "logps/chosen": -263.64251708984375, + "logps/rejected": -792.9046630859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.136805057525635, + "rewards/margins": 48.80036926269531, + "rewards/rejected": -53.93717575073242, + "step": 244 + }, + { + "epoch": 1.74, + "grad_norm": 2.6335443441550503e-05, + "learning_rate": 3.0662419185644114e-07, + "logits/chosen": -2.6054391860961914, + "logits/rejected": -2.5965652465820312, + "logps/chosen": -241.22071838378906, + "logps/rejected": -766.3815307617188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.681996822357178, + "rewards/margins": 49.40990447998047, + "rewards/rejected": -54.09189987182617, + "step": 245 + }, + { + "epoch": 1.75, + "grad_norm": 2.0432760062069497, + "learning_rate": 3.0523238992455104e-07, + "logits/chosen": -2.6749606132507324, + "logits/rejected": -2.5949466228485107, + "logps/chosen": -281.06781005859375, + "logps/rejected": -775.1441040039062, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.159917831420898, + "rewards/margins": 48.37465286254883, + "rewards/rejected": -53.53456497192383, + "step": 246 + }, + { + "epoch": 1.75, + "grad_norm": 5.446978339663144e-06, + "learning_rate": 3.038387859418955e-07, + "logits/chosen": -2.6239516735076904, + "logits/rejected": -2.564690589904785, + "logps/chosen": -281.8948669433594, + "logps/rejected": -761.5697021484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.757047176361084, + "rewards/margins": 42.52853775024414, + "rewards/rejected": -48.285587310791016, + "step": 247 + }, + { + "epoch": 1.76, + "grad_norm": 0.00042802034863944895, + "learning_rate": 3.024434253771773e-07, + "logits/chosen": -2.6605594158172607, + "logits/rejected": -2.59136700630188, + "logps/chosen": -284.0221252441406, + "logps/rejected": -781.1024169921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.681607961654663, + "rewards/margins": 47.56032943725586, + "rewards/rejected": -50.24193572998047, + "step": 248 + }, + { + "epoch": 1.77, + "grad_norm": 3.7320314436892344e-05, + "learning_rate": 3.010463537564108e-07, + "logits/chosen": -2.6689367294311523, + "logits/rejected": -2.5771164894104004, + "logps/chosen": -277.67987060546875, + "logps/rejected": -801.4888916015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3985414505004883, + "rewards/margins": 51.7540283203125, + "rewards/rejected": -55.15257263183594, + "step": 249 + }, + { + "epoch": 1.78, + "grad_norm": 0.00159187072056745, + "learning_rate": 2.996476166614363e-07, + "logits/chosen": -2.702686071395874, + "logits/rejected": -2.6569106578826904, + "logps/chosen": -237.650634765625, + "logps/rejected": -729.841796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.724506139755249, + "rewards/margins": 43.79349899291992, + "rewards/rejected": -46.518001556396484, + "step": 250 + }, + { + "epoch": 1.78, + "grad_norm": 0.0001677942293223587, + "learning_rate": 2.982472597284334e-07, + "logits/chosen": -2.605083465576172, + "logits/rejected": -2.506571054458618, + "logps/chosen": -264.5461120605469, + "logps/rejected": -649.6758422851562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.750299096107483, + "rewards/margins": 41.120506286621094, + "rewards/rejected": -42.87080383300781, + "step": 251 + }, + { + "epoch": 1.79, + "grad_norm": 2.1421941505892814e-05, + "learning_rate": 2.968453286464312e-07, + "logits/chosen": -2.647456169128418, + "logits/rejected": -2.610846757888794, + "logps/chosen": -261.5907897949219, + "logps/rejected": -771.17578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1900571584701538, + "rewards/margins": 45.82322692871094, + "rewards/rejected": -47.01328659057617, + "step": 252 + }, + { + "epoch": 1.8, + "grad_norm": 0.00015590373196866202, + "learning_rate": 2.9544186915581834e-07, + "logits/chosen": -2.6333425045013428, + "logits/rejected": -2.5818209648132324, + "logps/chosen": -249.65403747558594, + "logps/rejected": -702.033203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0030667781829834, + "rewards/margins": 39.55420684814453, + "rewards/rejected": -41.557273864746094, + "step": 253 + }, + { + "epoch": 1.8, + "grad_norm": 0.0008043521892293433, + "learning_rate": 2.9403692704685037e-07, + "logits/chosen": -2.6357340812683105, + "logits/rejected": -2.529351234436035, + "logps/chosen": -261.9450378417969, + "logps/rejected": -638.3784790039062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3327512741088867, + "rewards/margins": 39.044715881347656, + "rewards/rejected": -41.377464294433594, + "step": 254 + }, + { + "epoch": 1.81, + "grad_norm": 0.0004945163263501975, + "learning_rate": 2.9263054815815595e-07, + "logits/chosen": -2.6310646533966064, + "logits/rejected": -2.6107215881347656, + "logps/chosen": -192.73980712890625, + "logps/rejected": -628.2189331054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6959128379821777, + "rewards/margins": 37.87361526489258, + "rewards/rejected": -40.56952667236328, + "step": 255 + }, + { + "epoch": 1.82, + "grad_norm": 0.00015301559540641064, + "learning_rate": 2.9122277837524084e-07, + "logits/chosen": -2.665829658508301, + "logits/rejected": -2.623096466064453, + "logps/chosen": -180.1748809814453, + "logps/rejected": -652.5894775390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5211044549942017, + "rewards/margins": 41.80388641357422, + "rewards/rejected": -43.32499694824219, + "step": 256 + }, + { + "epoch": 1.83, + "grad_norm": 9.684402065279022e-05, + "learning_rate": 2.8981366362899113e-07, + "logits/chosen": -2.6399521827697754, + "logits/rejected": -2.604511260986328, + "logps/chosen": -253.05758666992188, + "logps/rejected": -746.2796630859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4746952056884766, + "rewards/margins": 46.2356071472168, + "rewards/rejected": -49.71030044555664, + "step": 257 + }, + { + "epoch": 1.83, + "grad_norm": 6.100681113349319e-06, + "learning_rate": 2.884032498941749e-07, + "logits/chosen": -2.634425640106201, + "logits/rejected": -2.5927858352661133, + "logps/chosen": -288.1816711425781, + "logps/rejected": -758.2382202148438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0677196979522705, + "rewards/margins": 45.65802001953125, + "rewards/rejected": -48.725738525390625, + "step": 258 + }, + { + "epoch": 1.84, + "grad_norm": 7.610088736541747e-05, + "learning_rate": 2.8699158318794166e-07, + "logits/chosen": -2.6393537521362305, + "logits/rejected": -2.626939058303833, + "logps/chosen": -251.40280151367188, + "logps/rejected": -755.636962890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1281187534332275, + "rewards/margins": 43.45673370361328, + "rewards/rejected": -44.58485412597656, + "step": 259 + }, + { + "epoch": 1.85, + "grad_norm": 0.0030858557566510627, + "learning_rate": 2.8557870956832133e-07, + "logits/chosen": -2.6407923698425293, + "logits/rejected": -2.5685319900512695, + "logps/chosen": -267.18878173828125, + "logps/rejected": -699.4261474609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6882833242416382, + "rewards/margins": 40.82693862915039, + "rewards/rejected": -42.515220642089844, + "step": 260 + }, + { + "epoch": 1.85, + "grad_norm": 0.002631002996385207, + "learning_rate": 2.841646751327214e-07, + "logits/chosen": -2.6571507453918457, + "logits/rejected": -2.655595541000366, + "logps/chosen": -246.01202392578125, + "logps/rejected": -697.6840209960938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.316676616668701, + "rewards/margins": 43.954627990722656, + "rewards/rejected": -46.271305084228516, + "step": 261 + }, + { + "epoch": 1.86, + "grad_norm": 0.049924761108460634, + "learning_rate": 2.827495260164232e-07, + "logits/chosen": -2.6571033000946045, + "logits/rejected": -2.5742335319519043, + "logps/chosen": -230.77938842773438, + "logps/rejected": -669.7883911132812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1973390579223633, + "rewards/margins": 41.00544738769531, + "rewards/rejected": -44.202789306640625, + "step": 262 + }, + { + "epoch": 1.87, + "grad_norm": 0.003005862420586273, + "learning_rate": 2.8133330839107604e-07, + "logits/chosen": -2.5991525650024414, + "logits/rejected": -2.514803647994995, + "logps/chosen": -217.24435424804688, + "logps/rejected": -637.5347290039062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6602402925491333, + "rewards/margins": 40.367576599121094, + "rewards/rejected": -42.02781677246094, + "step": 263 + }, + { + "epoch": 1.88, + "grad_norm": 0.010664533475038132, + "learning_rate": 2.7991606846319146e-07, + "logits/chosen": -2.6892874240875244, + "logits/rejected": -2.624999523162842, + "logps/chosen": -261.3077392578125, + "logps/rejected": -642.9027099609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9962120056152344, + "rewards/margins": 38.43465042114258, + "rewards/rejected": -40.43086242675781, + "step": 264 + }, + { + "epoch": 1.88, + "grad_norm": 0.0010797100202551814, + "learning_rate": 2.784978524726351e-07, + "logits/chosen": -2.622413158416748, + "logits/rejected": -2.6508889198303223, + "logps/chosen": -217.48316955566406, + "logps/rejected": -767.8394775390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3538990020751953, + "rewards/margins": 47.93068313598633, + "rewards/rejected": -49.284576416015625, + "step": 265 + }, + { + "epoch": 1.89, + "grad_norm": 0.0007242841691088611, + "learning_rate": 2.7707870669111865e-07, + "logits/chosen": -2.5968880653381348, + "logits/rejected": -2.5095863342285156, + "logps/chosen": -241.19483947753906, + "logps/rejected": -640.6085205078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3062682151794434, + "rewards/margins": 39.96379470825195, + "rewards/rejected": -42.27006149291992, + "step": 266 + }, + { + "epoch": 1.9, + "grad_norm": 8.567957703414913e-05, + "learning_rate": 2.7565867742068943e-07, + "logits/chosen": -2.621870756149292, + "logits/rejected": -2.6350061893463135, + "logps/chosen": -143.51182556152344, + "logps/rejected": -652.7653198242188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7991840839385986, + "rewards/margins": 43.602108001708984, + "rewards/rejected": -45.40129089355469, + "step": 267 + }, + { + "epoch": 1.9, + "grad_norm": 4.4761937159799064e-05, + "learning_rate": 2.7423781099222037e-07, + "logits/chosen": -2.6535634994506836, + "logits/rejected": -2.649076461791992, + "logps/chosen": -180.53128051757812, + "logps/rejected": -716.30078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6550968289375305, + "rewards/margins": 47.31419372558594, + "rewards/rejected": -47.96928405761719, + "step": 268 + }, + { + "epoch": 1.91, + "grad_norm": 0.005521136281896967, + "learning_rate": 2.7281615376389793e-07, + "logits/chosen": -2.6598236560821533, + "logits/rejected": -2.5527379512786865, + "logps/chosen": -221.47677612304688, + "logps/rejected": -671.9553833007812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6755762100219727, + "rewards/margins": 43.98118591308594, + "rewards/rejected": -45.656761169433594, + "step": 269 + }, + { + "epoch": 1.92, + "grad_norm": 0.0006070599004463767, + "learning_rate": 2.7139375211970995e-07, + "logits/chosen": -2.6026594638824463, + "logits/rejected": -2.574418067932129, + "logps/chosen": -215.58242797851562, + "logps/rejected": -682.86328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3032331466674805, + "rewards/margins": 43.540870666503906, + "rewards/rejected": -46.84410858154297, + "step": 270 + }, + { + "epoch": 1.93, + "grad_norm": 3.672659906419345e-05, + "learning_rate": 2.699706524679319e-07, + "logits/chosen": -2.695810556411743, + "logits/rejected": -2.631338119506836, + "logps/chosen": -279.63714599609375, + "logps/rejected": -696.0635375976562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9298660159111023, + "rewards/margins": 43.126502990722656, + "rewards/rejected": -44.056365966796875, + "step": 271 + }, + { + "epoch": 1.93, + "grad_norm": 3.8478870729048305e-05, + "learning_rate": 2.6854690123961306e-07, + "logits/chosen": -2.650444269180298, + "logits/rejected": -2.6091744899749756, + "logps/chosen": -185.26889038085938, + "logps/rejected": -613.1510009765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8187310695648193, + "rewards/margins": 39.30827331542969, + "rewards/rejected": -42.12700653076172, + "step": 272 + }, + { + "epoch": 1.94, + "grad_norm": 0.0033409658405874065, + "learning_rate": 2.671225448870614e-07, + "logits/chosen": -2.603532314300537, + "logits/rejected": -2.5760154724121094, + "logps/chosen": -212.02584838867188, + "logps/rejected": -620.2386474609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9137308597564697, + "rewards/margins": 37.46217346191406, + "rewards/rejected": -40.37590408325195, + "step": 273 + }, + { + "epoch": 1.95, + "grad_norm": 0.0005820394807695717, + "learning_rate": 2.6569762988232837e-07, + "logits/chosen": -2.6298093795776367, + "logits/rejected": -2.5624773502349854, + "logps/chosen": -228.9441375732422, + "logps/rejected": -632.8128662109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.203054189682007, + "rewards/margins": 39.316898345947266, + "rewards/rejected": -41.51995086669922, + "step": 274 + }, + { + "epoch": 1.95, + "grad_norm": 0.0065554193874592945, + "learning_rate": 2.64272202715692e-07, + "logits/chosen": -2.595012664794922, + "logits/rejected": -2.592442035675049, + "logps/chosen": -205.44903564453125, + "logps/rejected": -652.097412109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8901445865631104, + "rewards/margins": 39.876644134521484, + "rewards/rejected": -40.766788482666016, + "step": 275 + }, + { + "epoch": 1.96, + "grad_norm": 0.00016934721179843865, + "learning_rate": 2.6284630989414074e-07, + "logits/chosen": -2.6175105571746826, + "logits/rejected": -2.628140926361084, + "logps/chosen": -206.4010772705078, + "logps/rejected": -708.7982788085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3803884983062744, + "rewards/margins": 41.5196647644043, + "rewards/rejected": -43.90005111694336, + "step": 276 + }, + { + "epoch": 1.97, + "grad_norm": 0.0002480345644617611, + "learning_rate": 2.614199979398558e-07, + "logits/chosen": -2.673520088195801, + "logits/rejected": -2.6279067993164062, + "logps/chosen": -272.35662841796875, + "logps/rejected": -735.37744140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.564852237701416, + "rewards/margins": 42.00334167480469, + "rewards/rejected": -44.56819152832031, + "step": 277 + }, + { + "epoch": 1.98, + "grad_norm": 0.0001109960070996992, + "learning_rate": 2.5999331338869335e-07, + "logits/chosen": -2.569504499435425, + "logits/rejected": -2.548218011856079, + "logps/chosen": -201.81539916992188, + "logps/rejected": -581.187744140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.687114953994751, + "rewards/margins": 33.24966049194336, + "rewards/rejected": -34.93677520751953, + "step": 278 + }, + { + "epoch": 1.98, + "grad_norm": 0.012903775884326101, + "learning_rate": 2.5856630278866586e-07, + "logits/chosen": -2.579911947250366, + "logits/rejected": -2.5501151084899902, + "logps/chosen": -216.35986328125, + "logps/rejected": -767.426513671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9628387689590454, + "rewards/margins": 50.72027587890625, + "rewards/rejected": -51.6831169128418, + "step": 279 + }, + { + "epoch": 1.99, + "grad_norm": 2.800172551396059e-05, + "learning_rate": 2.5713901269842404e-07, + "logits/chosen": -2.6400136947631836, + "logits/rejected": -2.6133406162261963, + "logps/chosen": -230.74404907226562, + "logps/rejected": -669.561767578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6806840896606445, + "rewards/margins": 41.68968200683594, + "rewards/rejected": -43.370365142822266, + "step": 280 + }, + { + "epoch": 2.0, + "grad_norm": 0.006161302781600426, + "learning_rate": 2.557114896857374e-07, + "logits/chosen": -2.5869057178497314, + "logits/rejected": -2.6315174102783203, + "logps/chosen": -206.2305145263672, + "logps/rejected": -703.8362426757812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.414388418197632, + "rewards/margins": 43.79607391357422, + "rewards/rejected": -46.21046447753906, + "step": 281 + }, + { + "epoch": 2.0, + "grad_norm": 7.902496879472732e-05, + "learning_rate": 2.5428378032597483e-07, + "logits/chosen": -2.6702685356140137, + "logits/rejected": -2.6129355430603027, + "logps/chosen": -230.55783081054688, + "logps/rejected": -675.521728515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2104631662368774, + "rewards/margins": 41.848533630371094, + "rewards/rejected": -43.058998107910156, + "step": 282 + }, + { + "epoch": 2.01, + "grad_norm": 9.897198395623222e-05, + "learning_rate": 2.528559312005851e-07, + "logits/chosen": -2.627012014389038, + "logits/rejected": -2.569394588470459, + "logps/chosen": -198.86477661132812, + "logps/rejected": -617.51806640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3218894004821777, + "rewards/margins": 39.28834533691406, + "rewards/rejected": -41.610233306884766, + "step": 283 + }, + { + "epoch": 2.02, + "grad_norm": 0.0004785055869963256, + "learning_rate": 2.5142798889557707e-07, + "logits/chosen": -2.6592307090759277, + "logits/rejected": -2.507985830307007, + "logps/chosen": -279.1988830566406, + "logps/rejected": -646.310302734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3168385028839111, + "rewards/margins": 40.89637756347656, + "rewards/rejected": -42.213218688964844, + "step": 284 + }, + { + "epoch": 2.02, + "grad_norm": 0.0024097256602679394, + "learning_rate": 2.5e-07, + "logits/chosen": -2.6917591094970703, + "logits/rejected": -2.662294387817383, + "logps/chosen": -173.29786682128906, + "logps/rejected": -672.4750366210938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.212879180908203, + "rewards/margins": 44.444488525390625, + "rewards/rejected": -46.6573600769043, + "step": 285 + }, + { + "epoch": 2.03, + "grad_norm": 0.0016200061904394676, + "learning_rate": 2.485720111044229e-07, + "logits/chosen": -2.665874481201172, + "logits/rejected": -2.6122257709503174, + "logps/chosen": -281.1519775390625, + "logps/rejected": -763.8772583007812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.221536159515381, + "rewards/margins": 41.8398323059082, + "rewards/rejected": -45.061370849609375, + "step": 286 + }, + { + "epoch": 2.04, + "grad_norm": 9.836818154286851e-05, + "learning_rate": 2.4714406879941494e-07, + "logits/chosen": -2.6028149127960205, + "logits/rejected": -2.6029460430145264, + "logps/chosen": -234.9803466796875, + "logps/rejected": -736.207275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2461137771606445, + "rewards/margins": 43.96502685546875, + "rewards/rejected": -46.21113967895508, + "step": 287 + }, + { + "epoch": 2.05, + "grad_norm": 2.623367525270445e-05, + "learning_rate": 2.4571621967402515e-07, + "logits/chosen": -2.575897455215454, + "logits/rejected": -2.6291491985321045, + "logps/chosen": -227.8212890625, + "logps/rejected": -607.4825439453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8713579177856445, + "rewards/margins": 36.08367156982422, + "rewards/rejected": -38.95503234863281, + "step": 288 + }, + { + "epoch": 2.05, + "grad_norm": 3.510866497157002e-05, + "learning_rate": 2.442885103142626e-07, + "logits/chosen": -2.6428208351135254, + "logits/rejected": -2.6158065795898438, + "logps/chosen": -225.23252868652344, + "logps/rejected": -701.7928466796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.718440532684326, + "rewards/margins": 46.5265998840332, + "rewards/rejected": -49.24504470825195, + "step": 289 + }, + { + "epoch": 2.06, + "grad_norm": 0.0025679197446976222, + "learning_rate": 2.4286098730157594e-07, + "logits/chosen": -2.6087851524353027, + "logits/rejected": -2.54359769821167, + "logps/chosen": -249.76046752929688, + "logps/rejected": -675.6961669921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.213761568069458, + "rewards/margins": 41.53488540649414, + "rewards/rejected": -43.74864959716797, + "step": 290 + }, + { + "epoch": 2.07, + "grad_norm": 0.0016538273963101913, + "learning_rate": 2.4143369721133417e-07, + "logits/chosen": -2.6249642372131348, + "logits/rejected": -2.561633825302124, + "logps/chosen": -257.25384521484375, + "logps/rejected": -699.9495849609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6501119136810303, + "rewards/margins": 38.6519775390625, + "rewards/rejected": -42.302085876464844, + "step": 291 + }, + { + "epoch": 2.07, + "grad_norm": 0.00029595467287005196, + "learning_rate": 2.4000668661130673e-07, + "logits/chosen": -2.6015818119049072, + "logits/rejected": -2.6032016277313232, + "logps/chosen": -228.68824768066406, + "logps/rejected": -676.7217407226562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.533616781234741, + "rewards/margins": 39.18714141845703, + "rewards/rejected": -41.72075653076172, + "step": 292 + }, + { + "epoch": 2.08, + "grad_norm": 0.0002268158468589767, + "learning_rate": 2.3858000206014417e-07, + "logits/chosen": -2.6472184658050537, + "logits/rejected": -2.558506727218628, + "logps/chosen": -276.91845703125, + "logps/rejected": -703.4375610351562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.186020851135254, + "rewards/margins": 38.456382751464844, + "rewards/rejected": -41.64240646362305, + "step": 293 + }, + { + "epoch": 2.09, + "grad_norm": 0.0001480350374284824, + "learning_rate": 2.3715369010585926e-07, + "logits/chosen": -2.6376490592956543, + "logits/rejected": -2.5585012435913086, + "logps/chosen": -197.99951171875, + "logps/rejected": -628.4188232421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.551281213760376, + "rewards/margins": 38.564353942871094, + "rewards/rejected": -40.11564254760742, + "step": 294 + }, + { + "epoch": 2.1, + "grad_norm": 0.002305618946789525, + "learning_rate": 2.3572779728430797e-07, + "logits/chosen": -2.651035785675049, + "logits/rejected": -2.577578544616699, + "logps/chosen": -261.4287414550781, + "logps/rejected": -666.9348754882812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7089169025421143, + "rewards/margins": 37.88790512084961, + "rewards/rejected": -39.59682083129883, + "step": 295 + }, + { + "epoch": 2.1, + "grad_norm": 0.0034613775928909337, + "learning_rate": 2.3430237011767164e-07, + "logits/chosen": -2.5911641120910645, + "logits/rejected": -2.553837776184082, + "logps/chosen": -204.30679321289062, + "logps/rejected": -702.823486328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35330891609191895, + "rewards/margins": 44.26299285888672, + "rewards/rejected": -44.616310119628906, + "step": 296 + }, + { + "epoch": 2.11, + "grad_norm": 9.184969823604947e-06, + "learning_rate": 2.3287745511293854e-07, + "logits/chosen": -2.580634593963623, + "logits/rejected": -2.5209126472473145, + "logps/chosen": -235.25047302246094, + "logps/rejected": -680.2611083984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3681640625, + "rewards/margins": 40.61575698852539, + "rewards/rejected": -42.98392105102539, + "step": 297 + }, + { + "epoch": 2.12, + "grad_norm": 0.00105454981366374, + "learning_rate": 2.3145309876038697e-07, + "logits/chosen": -2.6204841136932373, + "logits/rejected": -2.598942756652832, + "logps/chosen": -244.71981811523438, + "logps/rejected": -705.040283203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.660607933998108, + "rewards/margins": 43.9911994934082, + "rewards/rejected": -45.65180969238281, + "step": 298 + }, + { + "epoch": 2.12, + "grad_norm": 0.0072056648000134995, + "learning_rate": 2.3002934753206808e-07, + "logits/chosen": -2.608703851699829, + "logits/rejected": -2.567575454711914, + "logps/chosen": -229.6112060546875, + "logps/rejected": -691.8590087890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1537482738494873, + "rewards/margins": 39.864192962646484, + "rewards/rejected": -42.017940521240234, + "step": 299 + }, + { + "epoch": 2.13, + "grad_norm": 0.0013836047297960286, + "learning_rate": 2.2860624788029013e-07, + "logits/chosen": -2.596379041671753, + "logits/rejected": -2.5666017532348633, + "logps/chosen": -244.7979736328125, + "logps/rejected": -710.8087158203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7877392768859863, + "rewards/margins": 41.28485870361328, + "rewards/rejected": -45.072593688964844, + "step": 300 + }, + { + "epoch": 2.14, + "grad_norm": 0.0008413110369675217, + "learning_rate": 2.271838462361021e-07, + "logits/chosen": -2.6119728088378906, + "logits/rejected": -2.599562168121338, + "logps/chosen": -197.46572875976562, + "logps/rejected": -650.60498046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3605077266693115, + "rewards/margins": 41.13031768798828, + "rewards/rejected": -42.490821838378906, + "step": 301 + }, + { + "epoch": 2.15, + "grad_norm": 0.006898520960629149, + "learning_rate": 2.2576218900777963e-07, + "logits/chosen": -2.6114397048950195, + "logits/rejected": -2.558812141418457, + "logps/chosen": -211.0338897705078, + "logps/rejected": -649.7169189453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.402194619178772, + "rewards/margins": 41.8548583984375, + "rewards/rejected": -43.257049560546875, + "step": 302 + }, + { + "epoch": 2.15, + "grad_norm": 0.0010674806112674939, + "learning_rate": 2.2434132257931054e-07, + "logits/chosen": -2.5946736335754395, + "logits/rejected": -2.576636791229248, + "logps/chosen": -200.5345916748047, + "logps/rejected": -666.674072265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.099646806716919, + "rewards/margins": 40.62255096435547, + "rewards/rejected": -41.72219467163086, + "step": 303 + }, + { + "epoch": 2.16, + "grad_norm": 0.0021846805494892224, + "learning_rate": 2.2292129330888136e-07, + "logits/chosen": -2.6382503509521484, + "logits/rejected": -2.6354548931121826, + "logps/chosen": -213.88092041015625, + "logps/rejected": -718.42822265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.403564453125, + "rewards/margins": 46.48717498779297, + "rewards/rejected": -47.8907356262207, + "step": 304 + }, + { + "epoch": 2.17, + "grad_norm": 0.0016220687018345893, + "learning_rate": 2.2150214752736485e-07, + "logits/chosen": -2.556666612625122, + "logits/rejected": -2.5358595848083496, + "logps/chosen": -224.96206665039062, + "logps/rejected": -661.4259643554688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0268585681915283, + "rewards/margins": 41.1568603515625, + "rewards/rejected": -44.1837158203125, + "step": 305 + }, + { + "epoch": 2.17, + "grad_norm": 0.00037503711674353416, + "learning_rate": 2.2008393153680857e-07, + "logits/chosen": -2.635861396789551, + "logits/rejected": -2.609677791595459, + "logps/chosen": -256.7283935546875, + "logps/rejected": -734.5321044921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5202267169952393, + "rewards/margins": 43.601497650146484, + "rewards/rejected": -45.121726989746094, + "step": 306 + }, + { + "epoch": 2.18, + "grad_norm": 0.0005774733677771591, + "learning_rate": 2.1866669160892389e-07, + "logits/chosen": -2.586963653564453, + "logits/rejected": -2.598660469055176, + "logps/chosen": -220.0684051513672, + "logps/rejected": -716.798583984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.434251546859741, + "rewards/margins": 42.49077224731445, + "rewards/rejected": -44.925018310546875, + "step": 307 + }, + { + "epoch": 2.19, + "grad_norm": 0.001751362751525598, + "learning_rate": 2.1725047398357676e-07, + "logits/chosen": -2.6196722984313965, + "logits/rejected": -2.5710983276367188, + "logps/chosen": -223.76885986328125, + "logps/rejected": -678.039794921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.606354832649231, + "rewards/margins": 43.118324279785156, + "rewards/rejected": -44.72467803955078, + "step": 308 + }, + { + "epoch": 2.2, + "grad_norm": 5.094191908734943e-05, + "learning_rate": 2.158353248672786e-07, + "logits/chosen": -2.622854709625244, + "logits/rejected": -2.5015876293182373, + "logps/chosen": -248.0614013671875, + "logps/rejected": -601.8358154296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3057637214660645, + "rewards/margins": 37.946083068847656, + "rewards/rejected": -41.25184631347656, + "step": 309 + }, + { + "epoch": 2.2, + "grad_norm": 0.0003741776397116977, + "learning_rate": 2.1442129043167873e-07, + "logits/chosen": -2.59749436378479, + "logits/rejected": -2.5974643230438232, + "logps/chosen": -206.97317504882812, + "logps/rejected": -680.8038330078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9993762969970703, + "rewards/margins": 41.59443283081055, + "rewards/rejected": -44.593807220458984, + "step": 310 + }, + { + "epoch": 2.21, + "grad_norm": 0.0023736380333349238, + "learning_rate": 2.1300841681205843e-07, + "logits/chosen": -2.5770013332366943, + "logits/rejected": -2.5992891788482666, + "logps/chosen": -225.75082397460938, + "logps/rejected": -676.5971069335938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0207276344299316, + "rewards/margins": 41.68965148925781, + "rewards/rejected": -43.71038055419922, + "step": 311 + }, + { + "epoch": 2.22, + "grad_norm": 8.667614968073643e-05, + "learning_rate": 2.1159675010582518e-07, + "logits/chosen": -2.5977702140808105, + "logits/rejected": -2.556772232055664, + "logps/chosen": -174.94638061523438, + "logps/rejected": -681.7984619140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5728871822357178, + "rewards/margins": 44.41339874267578, + "rewards/rejected": -45.98628234863281, + "step": 312 + }, + { + "epoch": 2.22, + "grad_norm": 0.000763634501313599, + "learning_rate": 2.101863363710089e-07, + "logits/chosen": -2.605037212371826, + "logits/rejected": -2.578495979309082, + "logps/chosen": -191.00990295410156, + "logps/rejected": -634.2423706054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0012569427490234, + "rewards/margins": 38.12276840209961, + "rewards/rejected": -41.1240234375, + "step": 313 + }, + { + "epoch": 2.23, + "grad_norm": 0.0009550845588240801, + "learning_rate": 2.087772216247592e-07, + "logits/chosen": -2.6382060050964355, + "logits/rejected": -2.5790133476257324, + "logps/chosen": -272.6337890625, + "logps/rejected": -766.276123046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4370644092559814, + "rewards/margins": 44.511287689208984, + "rewards/rejected": -46.94835662841797, + "step": 314 + }, + { + "epoch": 2.24, + "grad_norm": 0.000401801373142826, + "learning_rate": 2.0736945184184403e-07, + "logits/chosen": -2.648123025894165, + "logits/rejected": -2.6512463092803955, + "logps/chosen": -156.13302612304688, + "logps/rejected": -584.9524536132812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.329543948173523, + "rewards/margins": 38.95381164550781, + "rewards/rejected": -40.283355712890625, + "step": 315 + }, + { + "epoch": 2.25, + "grad_norm": 6.649777692716186e-06, + "learning_rate": 2.0596307295314958e-07, + "logits/chosen": -2.5876832008361816, + "logits/rejected": -2.5578882694244385, + "logps/chosen": -232.2501983642578, + "logps/rejected": -697.078369140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.266294479370117, + "rewards/margins": 42.157958984375, + "rewards/rejected": -44.42424774169922, + "step": 316 + }, + { + "epoch": 2.25, + "grad_norm": 9.140682724368651e-05, + "learning_rate": 2.0455813084418167e-07, + "logits/chosen": -2.602666139602661, + "logits/rejected": -2.5334253311157227, + "logps/chosen": -285.1061096191406, + "logps/rejected": -680.6642456054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9621727466583252, + "rewards/margins": 39.829532623291016, + "rewards/rejected": -41.79170608520508, + "step": 317 + }, + { + "epoch": 2.26, + "grad_norm": 0.0004069614970836941, + "learning_rate": 2.0315467135356878e-07, + "logits/chosen": -2.672846555709839, + "logits/rejected": -2.6326587200164795, + "logps/chosen": -178.00967407226562, + "logps/rejected": -595.0243530273438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9435124397277832, + "rewards/margins": 37.308353424072266, + "rewards/rejected": -39.25186538696289, + "step": 318 + }, + { + "epoch": 2.27, + "grad_norm": 8.38470714528249e-05, + "learning_rate": 2.0175274027156668e-07, + "logits/chosen": -2.611708164215088, + "logits/rejected": -2.4825704097747803, + "logps/chosen": -224.2328338623047, + "logps/rejected": -599.7462768554688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9980239868164062, + "rewards/margins": 36.05885314941406, + "rewards/rejected": -39.05687713623047, + "step": 319 + }, + { + "epoch": 2.27, + "grad_norm": 0.0012240242683178152, + "learning_rate": 2.0035238333856368e-07, + "logits/chosen": -2.6673340797424316, + "logits/rejected": -2.6272075176239014, + "logps/chosen": -227.26571655273438, + "logps/rejected": -684.0858154296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.831061601638794, + "rewards/margins": 41.962860107421875, + "rewards/rejected": -44.79391860961914, + "step": 320 + }, + { + "epoch": 2.28, + "grad_norm": 0.00045418282223112945, + "learning_rate": 1.9895364624358921e-07, + "logits/chosen": -2.6622133255004883, + "logits/rejected": -2.591498613357544, + "logps/chosen": -260.5234069824219, + "logps/rejected": -682.2036743164062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1419260501861572, + "rewards/margins": 40.579750061035156, + "rewards/rejected": -42.7216796875, + "step": 321 + }, + { + "epoch": 2.29, + "grad_norm": 0.0004378714838924887, + "learning_rate": 1.975565746228227e-07, + "logits/chosen": -2.5633544921875, + "logits/rejected": -2.539536952972412, + "logps/chosen": -224.4319610595703, + "logps/rejected": -752.7374267578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9543933868408203, + "rewards/margins": 46.37248611450195, + "rewards/rejected": -49.32687759399414, + "step": 322 + }, + { + "epoch": 2.29, + "grad_norm": 0.0028377027516268524, + "learning_rate": 1.9616121405810454e-07, + "logits/chosen": -2.5777812004089355, + "logits/rejected": -2.5560085773468018, + "logps/chosen": -240.31675720214844, + "logps/rejected": -698.7748413085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7276869416236877, + "rewards/margins": 46.07208251953125, + "rewards/rejected": -46.79977035522461, + "step": 323 + }, + { + "epoch": 2.3, + "grad_norm": 0.0016683153037944133, + "learning_rate": 1.94767610075449e-07, + "logits/chosen": -2.6497857570648193, + "logits/rejected": -2.5724897384643555, + "logps/chosen": -233.23532104492188, + "logps/rejected": -685.0447998046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.107954740524292, + "rewards/margins": 39.54705810546875, + "rewards/rejected": -41.65501403808594, + "step": 324 + }, + { + "epoch": 2.31, + "grad_norm": 0.00019881845919100384, + "learning_rate": 1.9337580814355887e-07, + "logits/chosen": -2.6256721019744873, + "logits/rejected": -2.587137222290039, + "logps/chosen": -244.54515075683594, + "logps/rejected": -746.85986328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2882766723632812, + "rewards/margins": 44.377628326416016, + "rewards/rejected": -45.66590118408203, + "step": 325 + }, + { + "epoch": 2.32, + "grad_norm": 0.001582618972464571, + "learning_rate": 1.919858536723421e-07, + "logits/chosen": -2.628737688064575, + "logits/rejected": -2.534581422805786, + "logps/chosen": -240.67230224609375, + "logps/rejected": -640.6585693359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.685371994972229, + "rewards/margins": 41.5238151550293, + "rewards/rejected": -43.20918655395508, + "step": 326 + }, + { + "epoch": 2.32, + "grad_norm": 0.001290971668102559, + "learning_rate": 1.9059779201142995e-07, + "logits/chosen": -2.6312174797058105, + "logits/rejected": -2.589174747467041, + "logps/chosen": -195.58197021484375, + "logps/rejected": -598.4661254882812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.398301362991333, + "rewards/margins": 33.90585708618164, + "rewards/rejected": -36.30415725708008, + "step": 327 + }, + { + "epoch": 2.33, + "grad_norm": 0.0012664768155046587, + "learning_rate": 1.892116684486976e-07, + "logits/chosen": -2.5974056720733643, + "logits/rejected": -2.633094549179077, + "logps/chosen": -200.31015014648438, + "logps/rejected": -698.3260498046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7488055229187012, + "rewards/margins": 44.871368408203125, + "rewards/rejected": -46.62017822265625, + "step": 328 + }, + { + "epoch": 2.34, + "grad_norm": 0.00039048721449333115, + "learning_rate": 1.8782752820878633e-07, + "logits/chosen": -2.6140198707580566, + "logits/rejected": -2.52712345123291, + "logps/chosen": -234.4675750732422, + "logps/rejected": -654.506103515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5545706748962402, + "rewards/margins": 40.421451568603516, + "rewards/rejected": -43.97602081298828, + "step": 329 + }, + { + "epoch": 2.34, + "grad_norm": 0.0017548300093342497, + "learning_rate": 1.864454164516283e-07, + "logits/chosen": -2.660311460494995, + "logits/rejected": -2.6248619556427, + "logps/chosen": -227.59027099609375, + "logps/rejected": -714.8369140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7531324625015259, + "rewards/margins": 44.42414855957031, + "rewards/rejected": -45.17728042602539, + "step": 330 + }, + { + "epoch": 2.35, + "grad_norm": 0.0005959978512754461, + "learning_rate": 1.850653782709729e-07, + "logits/chosen": -2.649901866912842, + "logits/rejected": -2.581340789794922, + "logps/chosen": -259.758544921875, + "logps/rejected": -674.2744750976562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6842352151870728, + "rewards/margins": 38.342628479003906, + "rewards/rejected": -40.026859283447266, + "step": 331 + }, + { + "epoch": 2.36, + "grad_norm": 3.709025643436513e-05, + "learning_rate": 1.8368745869291558e-07, + "logits/chosen": -2.5979907512664795, + "logits/rejected": -2.596640110015869, + "logps/chosen": -211.50502014160156, + "logps/rejected": -731.789306640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8290282487869263, + "rewards/margins": 47.80715560913086, + "rewards/rejected": -49.63618469238281, + "step": 332 + }, + { + "epoch": 2.37, + "grad_norm": 0.0036488075345308597, + "learning_rate": 1.8231170267442868e-07, + "logits/chosen": -2.5910720825195312, + "logits/rejected": -2.5695204734802246, + "logps/chosen": -209.30166625976562, + "logps/rejected": -665.1473388671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5559329986572266, + "rewards/margins": 42.57988739013672, + "rewards/rejected": -44.13582229614258, + "step": 333 + }, + { + "epoch": 2.37, + "grad_norm": 0.0006131944136766324, + "learning_rate": 1.8093815510189492e-07, + "logits/chosen": -2.5853114128112793, + "logits/rejected": -2.477041244506836, + "logps/chosen": -211.20086669921875, + "logps/rejected": -642.9722900390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7581071853637695, + "rewards/margins": 40.621673583984375, + "rewards/rejected": -42.379783630371094, + "step": 334 + }, + { + "epoch": 2.38, + "grad_norm": 5.14970120234663e-05, + "learning_rate": 1.7956686078964255e-07, + "logits/chosen": -2.578829526901245, + "logits/rejected": -2.59527325630188, + "logps/chosen": -170.6302490234375, + "logps/rejected": -680.9962768554688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.764241337776184, + "rewards/margins": 43.7723274230957, + "rewards/rejected": -45.53657150268555, + "step": 335 + }, + { + "epoch": 2.39, + "grad_norm": 0.0037386719633775633, + "learning_rate": 1.7819786447848343e-07, + "logits/chosen": -2.6559715270996094, + "logits/rejected": -2.619791030883789, + "logps/chosen": -250.88482666015625, + "logps/rejected": -734.7058715820312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.568724274635315, + "rewards/margins": 45.02946472167969, + "rewards/rejected": -46.59818649291992, + "step": 336 + }, + { + "epoch": 2.39, + "grad_norm": 0.0001017621123061103, + "learning_rate": 1.768312108342531e-07, + "logits/chosen": -2.589627981185913, + "logits/rejected": -2.570620059967041, + "logps/chosen": -233.76475524902344, + "logps/rejected": -672.9090576171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6810810565948486, + "rewards/margins": 42.62928771972656, + "rewards/rejected": -46.31037521362305, + "step": 337 + }, + { + "epoch": 2.4, + "grad_norm": 1.9160645246744767e-05, + "learning_rate": 1.7546694444635394e-07, + "logits/chosen": -2.602494478225708, + "logits/rejected": -2.5336544513702393, + "logps/chosen": -239.96519470214844, + "logps/rejected": -679.0791625976562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5591204166412354, + "rewards/margins": 37.38166046142578, + "rewards/rejected": -38.94078063964844, + "step": 338 + }, + { + "epoch": 2.41, + "grad_norm": 2.0091906610524113e-05, + "learning_rate": 1.741051098262995e-07, + "logits/chosen": -2.5975148677825928, + "logits/rejected": -2.6190147399902344, + "logps/chosen": -188.81015014648438, + "logps/rejected": -659.60498046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3599088191986084, + "rewards/margins": 40.257572174072266, + "rewards/rejected": -41.61748123168945, + "step": 339 + }, + { + "epoch": 2.42, + "grad_norm": 3.4593526398341734e-08, + "learning_rate": 1.7274575140626315e-07, + "logits/chosen": -2.6841962337493896, + "logits/rejected": -2.5906381607055664, + "logps/chosen": -245.50047302246094, + "logps/rejected": -710.417724609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1471208333969116, + "rewards/margins": 43.29010009765625, + "rewards/rejected": -44.43722152709961, + "step": 340 + }, + { + "epoch": 2.42, + "grad_norm": 2.7222024824694683e-06, + "learning_rate": 1.7138891353762801e-07, + "logits/chosen": -2.575801372528076, + "logits/rejected": -2.5523197650909424, + "logps/chosen": -283.1704406738281, + "logps/rejected": -638.535888671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6456997394561768, + "rewards/margins": 33.805660247802734, + "rewards/rejected": -35.45136260986328, + "step": 341 + }, + { + "epoch": 2.43, + "grad_norm": 0.0012104166824291863, + "learning_rate": 1.7003464048953977e-07, + "logits/chosen": -2.6379201412200928, + "logits/rejected": -2.618004322052002, + "logps/chosen": -222.8129425048828, + "logps/rejected": -712.9454345703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5227084159851074, + "rewards/margins": 45.59199142456055, + "rewards/rejected": -48.11470031738281, + "step": 342 + }, + { + "epoch": 2.44, + "grad_norm": 0.00046587804915648864, + "learning_rate": 1.6868297644746254e-07, + "logits/chosen": -2.551722764968872, + "logits/rejected": -2.5107691287994385, + "logps/chosen": -246.80612182617188, + "logps/rejected": -664.0162353515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8306801319122314, + "rewards/margins": 41.05493927001953, + "rewards/rejected": -44.8856201171875, + "step": 343 + }, + { + "epoch": 2.44, + "grad_norm": 0.0005631611155407464, + "learning_rate": 1.6733396551173717e-07, + "logits/chosen": -2.6082587242126465, + "logits/rejected": -2.5588834285736084, + "logps/chosen": -233.0947265625, + "logps/rejected": -674.6539306640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.599180221557617, + "rewards/margins": 39.047279357910156, + "rewards/rejected": -41.646461486816406, + "step": 344 + }, + { + "epoch": 2.45, + "grad_norm": 5.9372559239474194e-05, + "learning_rate": 1.6598765169614244e-07, + "logits/chosen": -2.60490083694458, + "logits/rejected": -2.5994162559509277, + "logps/chosen": -203.57110595703125, + "logps/rejected": -664.82421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9850497245788574, + "rewards/margins": 43.876808166503906, + "rewards/rejected": -45.861854553222656, + "step": 345 + }, + { + "epoch": 2.46, + "grad_norm": 2.1088871614523448e-05, + "learning_rate": 1.6464407892645893e-07, + "logits/chosen": -2.6383602619171143, + "logits/rejected": -2.630079746246338, + "logps/chosen": -234.04293823242188, + "logps/rejected": -726.9970703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.531109094619751, + "rewards/margins": 41.77968978881836, + "rewards/rejected": -43.3108024597168, + "step": 346 + }, + { + "epoch": 2.47, + "grad_norm": 0.0014021935025840203, + "learning_rate": 1.63303291039036e-07, + "logits/chosen": -2.641950845718384, + "logits/rejected": -2.546299457550049, + "logps/chosen": -285.6778564453125, + "logps/rejected": -690.0791625976562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.683269500732422, + "rewards/margins": 40.81999206542969, + "rewards/rejected": -43.503257751464844, + "step": 347 + }, + { + "epoch": 2.47, + "grad_norm": 3.4666836301406825e-05, + "learning_rate": 1.619653317793613e-07, + "logits/chosen": -2.5416667461395264, + "logits/rejected": -2.52751088142395, + "logps/chosen": -210.34869384765625, + "logps/rejected": -671.1845092773438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.41754150390625, + "rewards/margins": 39.833343505859375, + "rewards/rejected": -44.250885009765625, + "step": 348 + }, + { + "epoch": 2.48, + "grad_norm": 0.0007157199319366419, + "learning_rate": 1.6063024480063393e-07, + "logits/chosen": -2.6710095405578613, + "logits/rejected": -2.5739660263061523, + "logps/chosen": -227.89349365234375, + "logps/rejected": -694.7256469726562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.771583080291748, + "rewards/margins": 42.04491424560547, + "rewards/rejected": -43.81649398803711, + "step": 349 + }, + { + "epoch": 2.49, + "grad_norm": 0.00010520819831513085, + "learning_rate": 1.5929807366233977e-07, + "logits/chosen": -2.561951160430908, + "logits/rejected": -2.546168804168701, + "logps/chosen": -201.97857666015625, + "logps/rejected": -677.587158203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8327938914299011, + "rewards/margins": 40.691749572753906, + "rewards/rejected": -41.524539947509766, + "step": 350 + }, + { + "epoch": 2.49, + "grad_norm": 0.0004182112792612398, + "learning_rate": 1.579688618288305e-07, + "logits/chosen": -2.615410327911377, + "logits/rejected": -2.572009801864624, + "logps/chosen": -223.80386352539062, + "logps/rejected": -629.01904296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1583346128463745, + "rewards/margins": 39.14400863647461, + "rewards/rejected": -40.302345275878906, + "step": 351 + }, + { + "epoch": 2.5, + "grad_norm": 0.0001455986729388551, + "learning_rate": 1.566426526679055e-07, + "logits/chosen": -2.594515562057495, + "logits/rejected": -2.613788604736328, + "logps/chosen": -202.57388305664062, + "logps/rejected": -716.1135864257812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.149061918258667, + "rewards/margins": 44.21555709838867, + "rewards/rejected": -47.364620208740234, + "step": 352 + }, + { + "epoch": 2.51, + "grad_norm": 0.0009348587787063696, + "learning_rate": 1.553194894493967e-07, + "logits/chosen": -2.6507999897003174, + "logits/rejected": -2.549879550933838, + "logps/chosen": -244.30679321289062, + "logps/rejected": -690.1889038085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0277652740478516, + "rewards/margins": 40.70738983154297, + "rewards/rejected": -41.73515701293945, + "step": 353 + }, + { + "epoch": 2.52, + "grad_norm": 0.0015558584245408938, + "learning_rate": 1.5399941534375715e-07, + "logits/chosen": -2.5570898056030273, + "logits/rejected": -2.519944190979004, + "logps/chosen": -221.26791381835938, + "logps/rejected": -613.4049072265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.230377674102783, + "rewards/margins": 34.03505325317383, + "rewards/rejected": -36.26543045043945, + "step": 354 + }, + { + "epoch": 2.52, + "grad_norm": 2.2550637506823006e-05, + "learning_rate": 1.5268247342065214e-07, + "logits/chosen": -2.663119316101074, + "logits/rejected": -2.6158151626586914, + "logps/chosen": -197.43954467773438, + "logps/rejected": -663.0337524414062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.776270866394043, + "rewards/margins": 40.074867248535156, + "rewards/rejected": -42.851139068603516, + "step": 355 + }, + { + "epoch": 2.53, + "grad_norm": 7.549393134340381e-05, + "learning_rate": 1.5136870664755426e-07, + "logits/chosen": -2.608278274536133, + "logits/rejected": -2.5857372283935547, + "logps/chosen": -216.80520629882812, + "logps/rejected": -673.6428833007812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.321716070175171, + "rewards/margins": 38.519309997558594, + "rewards/rejected": -41.841026306152344, + "step": 356 + }, + { + "epoch": 2.54, + "grad_norm": 0.003003858815259098, + "learning_rate": 1.5005815788834163e-07, + "logits/chosen": -2.580827236175537, + "logits/rejected": -2.584934949874878, + "logps/chosen": -241.7545928955078, + "logps/rejected": -730.5738525390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.154491662979126, + "rewards/margins": 43.19963836669922, + "rewards/rejected": -46.354129791259766, + "step": 357 + }, + { + "epoch": 2.54, + "grad_norm": 8.351241665511374e-05, + "learning_rate": 1.4875086990189867e-07, + "logits/chosen": -2.5776844024658203, + "logits/rejected": -2.6143195629119873, + "logps/chosen": -207.20211791992188, + "logps/rejected": -721.2095947265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.132437229156494, + "rewards/margins": 43.729515075683594, + "rewards/rejected": -45.86195373535156, + "step": 358 + }, + { + "epoch": 2.55, + "grad_norm": 0.00012143033178613899, + "learning_rate": 1.474468853407221e-07, + "logits/chosen": -2.652557849884033, + "logits/rejected": -2.601799488067627, + "logps/chosen": -220.03244018554688, + "logps/rejected": -673.8886108398438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9605017304420471, + "rewards/margins": 43.981842041015625, + "rewards/rejected": -44.942344665527344, + "step": 359 + }, + { + "epoch": 2.56, + "grad_norm": 1.2882499976377434e-05, + "learning_rate": 1.461462467495284e-07, + "logits/chosen": -2.5916476249694824, + "logits/rejected": -2.5584444999694824, + "logps/chosen": -210.636962890625, + "logps/rejected": -639.468994140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5335041284561157, + "rewards/margins": 38.12119674682617, + "rewards/rejected": -39.654701232910156, + "step": 360 + }, + { + "epoch": 2.56, + "grad_norm": 0.0004527468802473754, + "learning_rate": 1.448489965638663e-07, + "logits/chosen": -2.6484932899475098, + "logits/rejected": -2.573028326034546, + "logps/chosen": -244.07521057128906, + "logps/rejected": -662.1886596679688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.61525297164917, + "rewards/margins": 39.10810852050781, + "rewards/rejected": -40.72336196899414, + "step": 361 + }, + { + "epoch": 2.57, + "grad_norm": 0.00475086777095657, + "learning_rate": 1.4355517710873182e-07, + "logits/chosen": -2.6032724380493164, + "logits/rejected": -2.6103625297546387, + "logps/chosen": -205.00706481933594, + "logps/rejected": -664.04150390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4209562540054321, + "rewards/margins": 40.957252502441406, + "rewards/rejected": -42.378211975097656, + "step": 362 + }, + { + "epoch": 2.58, + "grad_norm": 0.0009867463258715018, + "learning_rate": 1.422648305971878e-07, + "logits/chosen": -2.5331664085388184, + "logits/rejected": -2.510838508605957, + "logps/chosen": -205.19004821777344, + "logps/rejected": -637.82666015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.593546986579895, + "rewards/margins": 39.659488677978516, + "rewards/rejected": -40.25303649902344, + "step": 363 + }, + { + "epoch": 2.59, + "grad_norm": 4.2845446503424804e-05, + "learning_rate": 1.4097799912898615e-07, + "logits/chosen": -2.6863322257995605, + "logits/rejected": -2.653110980987549, + "logps/chosen": -191.99600219726562, + "logps/rejected": -642.2059326171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.577883243560791, + "rewards/margins": 37.816490173339844, + "rewards/rejected": -39.394371032714844, + "step": 364 + }, + { + "epoch": 2.59, + "grad_norm": 0.00011492646055263091, + "learning_rate": 1.3969472468919462e-07, + "logits/chosen": -2.5851826667785645, + "logits/rejected": -2.606987237930298, + "logps/chosen": -191.4344024658203, + "logps/rejected": -655.99462890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.764204025268555, + "rewards/margins": 40.854801177978516, + "rewards/rejected": -45.61900329589844, + "step": 365 + }, + { + "epoch": 2.6, + "grad_norm": 4.472517740181175e-05, + "learning_rate": 1.3841504914682705e-07, + "logits/chosen": -2.5719308853149414, + "logits/rejected": -2.571589231491089, + "logps/chosen": -208.75506591796875, + "logps/rejected": -744.55078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9662027955055237, + "rewards/margins": 45.28382110595703, + "rewards/rejected": -46.25001907348633, + "step": 366 + }, + { + "epoch": 2.61, + "grad_norm": 0.00011504946952320573, + "learning_rate": 1.371390142534765e-07, + "logits/chosen": -2.65636944770813, + "logits/rejected": -2.56781005859375, + "logps/chosen": -251.87567138671875, + "logps/rejected": -683.939208984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0577185153961182, + "rewards/margins": 41.367767333984375, + "rewards/rejected": -42.42548370361328, + "step": 367 + }, + { + "epoch": 2.61, + "grad_norm": 1.802534654324117e-05, + "learning_rate": 1.3586666164195438e-07, + "logits/chosen": -2.6608686447143555, + "logits/rejected": -2.591402053833008, + "logps/chosen": -209.84713745117188, + "logps/rejected": -712.4869384765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5265132188796997, + "rewards/margins": 48.09614562988281, + "rewards/rejected": -48.622657775878906, + "step": 368 + }, + { + "epoch": 2.62, + "grad_norm": 0.00047667208624335917, + "learning_rate": 1.3459803282493103e-07, + "logits/chosen": -2.5973968505859375, + "logits/rejected": -2.5583910942077637, + "logps/chosen": -234.6646728515625, + "logps/rejected": -696.4039306640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8548390865325928, + "rewards/margins": 43.213844299316406, + "rewards/rejected": -44.06868362426758, + "step": 369 + }, + { + "epoch": 2.63, + "grad_norm": 3.4086575030952764e-05, + "learning_rate": 1.3333316919358157e-07, + "logits/chosen": -2.6356897354125977, + "logits/rejected": -2.6185624599456787, + "logps/chosen": -238.33004760742188, + "logps/rejected": -674.8753051757812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.579801559448242, + "rewards/margins": 37.65531921386719, + "rewards/rejected": -40.23512268066406, + "step": 370 + }, + { + "epoch": 2.64, + "grad_norm": 0.002182282077351439, + "learning_rate": 1.3207211201623585e-07, + "logits/chosen": -2.577144145965576, + "logits/rejected": -2.6193435192108154, + "logps/chosen": -193.91845703125, + "logps/rejected": -727.869140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6801921129226685, + "rewards/margins": 46.06007385253906, + "rewards/rejected": -47.740264892578125, + "step": 371 + }, + { + "epoch": 2.64, + "grad_norm": 0.0003152857150336939, + "learning_rate": 1.3081490243703127e-07, + "logits/chosen": -2.638441324234009, + "logits/rejected": -2.6296443939208984, + "logps/chosen": -190.9596710205078, + "logps/rejected": -676.0582275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.699096918106079, + "rewards/margins": 42.61787796020508, + "rewards/rejected": -44.316978454589844, + "step": 372 + }, + { + "epoch": 2.65, + "grad_norm": 0.006689558914866253, + "learning_rate": 1.2956158147457114e-07, + "logits/chosen": -2.7048721313476562, + "logits/rejected": -2.638596773147583, + "logps/chosen": -267.07464599609375, + "logps/rejected": -710.9136352539062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7823700904846191, + "rewards/margins": 37.170387268066406, + "rewards/rejected": -38.9527587890625, + "step": 373 + }, + { + "epoch": 2.66, + "grad_norm": 5.857269160628361e-06, + "learning_rate": 1.2831219002058594e-07, + "logits/chosen": -2.6002018451690674, + "logits/rejected": -2.5697901248931885, + "logps/chosen": -217.69949340820312, + "logps/rejected": -656.4019775390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.141796350479126, + "rewards/margins": 38.67666244506836, + "rewards/rejected": -41.818458557128906, + "step": 374 + }, + { + "epoch": 2.66, + "grad_norm": 0.0026208796687012363, + "learning_rate": 1.27066768838599e-07, + "logits/chosen": -2.621732234954834, + "logits/rejected": -2.626351833343506, + "logps/chosen": -186.13662719726562, + "logps/rejected": -647.14404296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.84818172454834, + "rewards/margins": 41.564369201660156, + "rewards/rejected": -45.41255187988281, + "step": 375 + }, + { + "epoch": 2.67, + "grad_norm": 2.6363534698210247e-05, + "learning_rate": 1.2582535856259722e-07, + "logits/chosen": -2.656799793243408, + "logits/rejected": -2.5723812580108643, + "logps/chosen": -251.66204833984375, + "logps/rejected": -726.56982421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7134242057800293, + "rewards/margins": 46.950050354003906, + "rewards/rejected": -48.663475036621094, + "step": 376 + }, + { + "epoch": 2.68, + "grad_norm": 5.76546846973494e-05, + "learning_rate": 1.245879996957044e-07, + "logits/chosen": -2.5991504192352295, + "logits/rejected": -2.5702285766601562, + "logps/chosen": -211.487548828125, + "logps/rejected": -703.1289672851562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2510039806365967, + "rewards/margins": 44.16550064086914, + "rewards/rejected": -45.41650390625, + "step": 377 + }, + { + "epoch": 2.69, + "grad_norm": 0.001184098212604301, + "learning_rate": 1.2335473260886045e-07, + "logits/chosen": -2.6441307067871094, + "logits/rejected": -2.5999584197998047, + "logps/chosen": -256.0811462402344, + "logps/rejected": -712.192138671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08934740722179413, + "rewards/margins": 41.13957977294922, + "rewards/rejected": -41.22892379760742, + "step": 378 + }, + { + "epoch": 2.69, + "grad_norm": 0.00024597070414198076, + "learning_rate": 1.2212559753950404e-07, + "logits/chosen": -2.6292409896850586, + "logits/rejected": -2.604518175125122, + "logps/chosen": -252.38339233398438, + "logps/rejected": -746.6467895507812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3741925954818726, + "rewards/margins": 47.41543960571289, + "rewards/rejected": -48.78962707519531, + "step": 379 + }, + { + "epoch": 2.7, + "grad_norm": 0.0025421630326573313, + "learning_rate": 1.2090063459025954e-07, + "logits/chosen": -2.5926647186279297, + "logits/rejected": -2.5215797424316406, + "logps/chosen": -211.66586303710938, + "logps/rejected": -710.5953369140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.025054454803467, + "rewards/margins": 45.2105712890625, + "rewards/rejected": -47.235626220703125, + "step": 380 + }, + { + "epoch": 2.71, + "grad_norm": 0.0031584492340192842, + "learning_rate": 1.1967988372762896e-07, + "logits/chosen": -2.63785982131958, + "logits/rejected": -2.549211025238037, + "logps/chosen": -235.86195373535156, + "logps/rejected": -685.3466796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.431046724319458, + "rewards/margins": 41.870277404785156, + "rewards/rejected": -44.30132293701172, + "step": 381 + }, + { + "epoch": 2.71, + "grad_norm": 0.00016908486309948843, + "learning_rate": 1.184633847806879e-07, + "logits/chosen": -2.584503650665283, + "logits/rejected": -2.5840470790863037, + "logps/chosen": -207.37738037109375, + "logps/rejected": -604.419921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0800158977508545, + "rewards/margins": 37.653812408447266, + "rewards/rejected": -40.733829498291016, + "step": 382 + }, + { + "epoch": 2.72, + "grad_norm": 0.0005062138566196237, + "learning_rate": 1.1725117743978566e-07, + "logits/chosen": -2.640798330307007, + "logits/rejected": -2.5329463481903076, + "logps/chosen": -201.2047119140625, + "logps/rejected": -646.1207275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5573039054870605, + "rewards/margins": 40.20195770263672, + "rewards/rejected": -42.75926208496094, + "step": 383 + }, + { + "epoch": 2.73, + "grad_norm": 0.0003014891658832199, + "learning_rate": 1.1604330125525078e-07, + "logits/chosen": -2.6083054542541504, + "logits/rejected": -2.56097412109375, + "logps/chosen": -224.90054321289062, + "logps/rejected": -685.1141967773438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5241632461547852, + "rewards/margins": 41.382118225097656, + "rewards/rejected": -42.906280517578125, + "step": 384 + }, + { + "epoch": 2.74, + "grad_norm": 5.004037055953319e-06, + "learning_rate": 1.1483979563610069e-07, + "logits/chosen": -2.6628429889678955, + "logits/rejected": -2.589789867401123, + "logps/chosen": -196.4730224609375, + "logps/rejected": -621.3598022460938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4821809530258179, + "rewards/margins": 41.21112823486328, + "rewards/rejected": -42.69330596923828, + "step": 385 + }, + { + "epoch": 2.74, + "grad_norm": 0.00041500847069534313, + "learning_rate": 1.1364069984875502e-07, + "logits/chosen": -2.6143016815185547, + "logits/rejected": -2.603292226791382, + "logps/chosen": -214.89755249023438, + "logps/rejected": -665.7808837890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5043654441833496, + "rewards/margins": 41.834659576416016, + "rewards/rejected": -43.339027404785156, + "step": 386 + }, + { + "epoch": 2.75, + "grad_norm": 0.0007447870046766562, + "learning_rate": 1.1244605301575572e-07, + "logits/chosen": -2.596665620803833, + "logits/rejected": -2.6054022312164307, + "logps/chosen": -240.6815185546875, + "logps/rejected": -739.9364013671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.371979236602783, + "rewards/margins": 47.72554016113281, + "rewards/rejected": -50.09751892089844, + "step": 387 + }, + { + "epoch": 2.76, + "grad_norm": 0.001801721898035798, + "learning_rate": 1.1125589411448994e-07, + "logits/chosen": -2.606245279312134, + "logits/rejected": -2.6088197231292725, + "logps/chosen": -213.52740478515625, + "logps/rejected": -682.692138671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.032228946685791, + "rewards/margins": 43.08326721191406, + "rewards/rejected": -45.115501403808594, + "step": 388 + }, + { + "epoch": 2.76, + "grad_norm": 0.0013265646460463825, + "learning_rate": 1.1007026197591812e-07, + "logits/chosen": -2.6485466957092285, + "logits/rejected": -2.5848703384399414, + "logps/chosen": -252.58580017089844, + "logps/rejected": -673.0241088867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9447808265686035, + "rewards/margins": 39.774879455566406, + "rewards/rejected": -42.719661712646484, + "step": 389 + }, + { + "epoch": 2.77, + "grad_norm": 0.004005690937088928, + "learning_rate": 1.0888919528330775e-07, + "logits/chosen": -2.6455395221710205, + "logits/rejected": -2.6180739402770996, + "logps/chosen": -178.38711547851562, + "logps/rejected": -677.9312744140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7555464506149292, + "rewards/margins": 46.52717590332031, + "rewards/rejected": -48.28272247314453, + "step": 390 + }, + { + "epoch": 2.78, + "grad_norm": 0.00023247665146038674, + "learning_rate": 1.077127325709705e-07, + "logits/chosen": -2.6322834491729736, + "logits/rejected": -2.6207876205444336, + "logps/chosen": -263.80767822265625, + "logps/rejected": -706.65478515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.04120135307312, + "rewards/margins": 41.17344284057617, + "rewards/rejected": -43.21464538574219, + "step": 391 + }, + { + "epoch": 2.79, + "grad_norm": 0.0008606931181420953, + "learning_rate": 1.0654091222300563e-07, + "logits/chosen": -2.5458884239196777, + "logits/rejected": -2.5368337631225586, + "logps/chosen": -221.91416931152344, + "logps/rejected": -622.33740234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2126212120056152, + "rewards/margins": 37.074501037597656, + "rewards/rejected": -40.2871208190918, + "step": 392 + }, + { + "epoch": 2.79, + "grad_norm": 0.004231680550935023, + "learning_rate": 1.0537377247204729e-07, + "logits/chosen": -2.6544291973114014, + "logits/rejected": -2.5833327770233154, + "logps/chosen": -249.5886993408203, + "logps/rejected": -669.6287841796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6856565475463867, + "rewards/margins": 39.63022232055664, + "rewards/rejected": -42.315879821777344, + "step": 393 + }, + { + "epoch": 2.8, + "grad_norm": 0.00855989058705333, + "learning_rate": 1.04211351398017e-07, + "logits/chosen": -2.635132312774658, + "logits/rejected": -2.564526081085205, + "logps/chosen": -219.7024383544922, + "logps/rejected": -609.31103515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.43125581741333, + "rewards/margins": 36.06136703491211, + "rewards/rejected": -41.49262237548828, + "step": 394 + }, + { + "epoch": 2.81, + "grad_norm": 0.0006365937608466654, + "learning_rate": 1.0305368692688174e-07, + "logits/chosen": -2.6452977657318115, + "logits/rejected": -2.583155870437622, + "logps/chosen": -204.53591918945312, + "logps/rejected": -595.3872680664062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6615965366363525, + "rewards/margins": 39.1407585144043, + "rewards/rejected": -40.8023567199707, + "step": 395 + }, + { + "epoch": 2.81, + "grad_norm": 0.0005451091252313615, + "learning_rate": 1.0190081682941592e-07, + "logits/chosen": -2.636286973953247, + "logits/rejected": -2.558438777923584, + "logps/chosen": -270.0765075683594, + "logps/rejected": -597.2915649414062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.357896327972412, + "rewards/margins": 35.33807373046875, + "rewards/rejected": -37.69597244262695, + "step": 396 + }, + { + "epoch": 2.82, + "grad_norm": 0.0014670215887261733, + "learning_rate": 1.0075277871996937e-07, + "logits/chosen": -2.6234705448150635, + "logits/rejected": -2.570161819458008, + "logps/chosen": -265.45263671875, + "logps/rejected": -665.8034057617188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.534239649772644, + "rewards/margins": 38.03791809082031, + "rewards/rejected": -39.57215881347656, + "step": 397 + }, + { + "epoch": 2.83, + "grad_norm": 0.001981063283650381, + "learning_rate": 9.960961005524032e-08, + "logits/chosen": -2.700791597366333, + "logits/rejected": -2.6455905437469482, + "logps/chosen": -264.726806640625, + "logps/rejected": -720.6319580078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7908623218536377, + "rewards/margins": 42.555809020996094, + "rewards/rejected": -45.34667205810547, + "step": 398 + }, + { + "epoch": 2.83, + "grad_norm": 0.00011146915815739721, + "learning_rate": 9.847134813305294e-08, + "logits/chosen": -2.6453797817230225, + "logits/rejected": -2.5401418209075928, + "logps/chosen": -263.7633361816406, + "logps/rejected": -784.4583740234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4651216268539429, + "rewards/margins": 47.63389587402344, + "rewards/rejected": -49.09901809692383, + "step": 399 + }, + { + "epoch": 2.84, + "grad_norm": 2.754720859276951e-05, + "learning_rate": 9.733803009114044e-08, + "logits/chosen": -2.5906217098236084, + "logits/rejected": -2.5824055671691895, + "logps/chosen": -158.09375, + "logps/rejected": -633.3319091796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7996079921722412, + "rewards/margins": 39.154239654541016, + "rewards/rejected": -40.95384979248047, + "step": 400 + }, + { + "epoch": 2.85, + "grad_norm": 0.00021134453858552522, + "learning_rate": 9.620969290593381e-08, + "logits/chosen": -2.5657520294189453, + "logits/rejected": -2.5771374702453613, + "logps/chosen": -204.86314392089844, + "logps/rejected": -677.7411499023438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7614800930023193, + "rewards/margins": 42.98869705200195, + "rewards/rejected": -45.750179290771484, + "step": 401 + }, + { + "epoch": 2.86, + "grad_norm": 3.642300106109489e-05, + "learning_rate": 9.508637339135472e-08, + "logits/chosen": -2.6181256771087646, + "logits/rejected": -2.6310133934020996, + "logps/chosen": -203.68988037109375, + "logps/rejected": -666.416748046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.610349655151367, + "rewards/margins": 40.2132682800293, + "rewards/rejected": -43.82361602783203, + "step": 402 + }, + { + "epoch": 2.86, + "grad_norm": 0.00012917295188805592, + "learning_rate": 9.396810819761514e-08, + "logits/chosen": -2.6647543907165527, + "logits/rejected": -2.576786756515503, + "logps/chosen": -224.27777099609375, + "logps/rejected": -607.6181640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.296034097671509, + "rewards/margins": 36.055564880371094, + "rewards/rejected": -38.351593017578125, + "step": 403 + }, + { + "epoch": 2.87, + "grad_norm": 0.00012113656367676832, + "learning_rate": 9.285493381002121e-08, + "logits/chosen": -2.6379270553588867, + "logits/rejected": -2.5847413539886475, + "logps/chosen": -225.47946166992188, + "logps/rejected": -711.302734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1094088554382324, + "rewards/margins": 45.114166259765625, + "rewards/rejected": -47.223567962646484, + "step": 404 + }, + { + "epoch": 2.88, + "grad_norm": 0.008868461301325964, + "learning_rate": 9.174688654778243e-08, + "logits/chosen": -2.675279140472412, + "logits/rejected": -2.5945000648498535, + "logps/chosen": -225.70831298828125, + "logps/rejected": -651.210693359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6225006580352783, + "rewards/margins": 40.46595001220703, + "rewards/rejected": -42.08844757080078, + "step": 405 + }, + { + "epoch": 2.88, + "grad_norm": 0.00026484718225131806, + "learning_rate": 9.064400256282755e-08, + "logits/chosen": -2.6134443283081055, + "logits/rejected": -2.553016424179077, + "logps/chosen": -230.35409545898438, + "logps/rejected": -712.2049560546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.64622163772583, + "rewards/margins": 46.537353515625, + "rewards/rejected": -49.183570861816406, + "step": 406 + }, + { + "epoch": 2.89, + "grad_norm": 0.000636173847539082, + "learning_rate": 8.954631783862457e-08, + "logits/chosen": -2.6305606365203857, + "logits/rejected": -2.590165376663208, + "logps/chosen": -251.324951171875, + "logps/rejected": -682.8800048828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3713853359222412, + "rewards/margins": 39.81315612792969, + "rewards/rejected": -41.184539794921875, + "step": 407 + }, + { + "epoch": 2.9, + "grad_norm": 0.0001759675388052945, + "learning_rate": 8.845386818900646e-08, + "logits/chosen": -2.638521194458008, + "logits/rejected": -2.5012216567993164, + "logps/chosen": -242.7923126220703, + "logps/rejected": -630.4002075195312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.512739896774292, + "rewards/margins": 36.14814758300781, + "rewards/rejected": -38.660888671875, + "step": 408 + }, + { + "epoch": 2.91, + "grad_norm": 0.0022876160868569913, + "learning_rate": 8.73666892570033e-08, + "logits/chosen": -2.615971565246582, + "logits/rejected": -2.5654449462890625, + "logps/chosen": -287.05596923828125, + "logps/rejected": -765.029296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.003164291381836, + "rewards/margins": 40.66764450073242, + "rewards/rejected": -44.670806884765625, + "step": 409 + }, + { + "epoch": 2.91, + "grad_norm": 1.0378383487139496e-05, + "learning_rate": 8.628481651367875e-08, + "logits/chosen": -2.5988502502441406, + "logits/rejected": -2.551170587539673, + "logps/chosen": -212.395751953125, + "logps/rejected": -683.1842041015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1727066040039062, + "rewards/margins": 40.93196487426758, + "rewards/rejected": -43.10467529296875, + "step": 410 + }, + { + "epoch": 2.92, + "grad_norm": 0.0014021716863452233, + "learning_rate": 8.52082852569732e-08, + "logits/chosen": -2.5625195503234863, + "logits/rejected": -2.533140182495117, + "logps/chosen": -209.16903686523438, + "logps/rejected": -684.4560546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5696115493774414, + "rewards/margins": 44.26431655883789, + "rewards/rejected": -45.83393096923828, + "step": 411 + }, + { + "epoch": 2.93, + "grad_norm": 8.825489804697242e-05, + "learning_rate": 8.413713061055206e-08, + "logits/chosen": -2.6121273040771484, + "logits/rejected": -2.5778162479400635, + "logps/chosen": -244.29971313476562, + "logps/rejected": -705.0411376953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.311461925506592, + "rewards/margins": 41.75151443481445, + "rewards/rejected": -44.0629768371582, + "step": 412 + }, + { + "epoch": 2.93, + "grad_norm": 4.763638445999946e-06, + "learning_rate": 8.307138752265933e-08, + "logits/chosen": -2.5962650775909424, + "logits/rejected": -2.5747714042663574, + "logps/chosen": -220.84149169921875, + "logps/rejected": -696.8626708984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.816655397415161, + "rewards/margins": 41.75939178466797, + "rewards/rejected": -44.5760498046875, + "step": 413 + }, + { + "epoch": 2.94, + "grad_norm": 0.0001758765944260812, + "learning_rate": 8.201109076497803e-08, + "logits/chosen": -2.529788017272949, + "logits/rejected": -2.5632526874542236, + "logps/chosen": -235.17721557617188, + "logps/rejected": -707.098876953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9940123558044434, + "rewards/margins": 43.75909423828125, + "rewards/rejected": -46.75310516357422, + "step": 414 + }, + { + "epoch": 2.95, + "grad_norm": 4.960700472153307e-05, + "learning_rate": 8.095627493149521e-08, + "logits/chosen": -2.6606791019439697, + "logits/rejected": -2.6119890213012695, + "logps/chosen": -253.69537353515625, + "logps/rejected": -740.9237670898438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0299689769744873, + "rewards/margins": 46.81749725341797, + "rewards/rejected": -47.84746551513672, + "step": 415 + }, + { + "epoch": 2.96, + "grad_norm": 4.843433542015454e-05, + "learning_rate": 7.990697443737335e-08, + "logits/chosen": -2.6203713417053223, + "logits/rejected": -2.561133861541748, + "logps/chosen": -249.41090393066406, + "logps/rejected": -725.7716674804688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9014408588409424, + "rewards/margins": 45.909523010253906, + "rewards/rejected": -47.81096649169922, + "step": 416 + }, + { + "epoch": 2.96, + "grad_norm": 0.00014445998836812535, + "learning_rate": 7.886322351782782e-08, + "logits/chosen": -2.5868492126464844, + "logits/rejected": -2.562523126602173, + "logps/chosen": -178.42391967773438, + "logps/rejected": -647.8771362304688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.039259672164917, + "rewards/margins": 42.71385955810547, + "rewards/rejected": -45.75312042236328, + "step": 417 + }, + { + "epoch": 2.97, + "grad_norm": 0.003763953993902461, + "learning_rate": 7.782505622700964e-08, + "logits/chosen": -2.6626133918762207, + "logits/rejected": -2.604381561279297, + "logps/chosen": -257.4096984863281, + "logps/rejected": -706.0999755859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1248116493225098, + "rewards/margins": 41.654457092285156, + "rewards/rejected": -44.779273986816406, + "step": 418 + }, + { + "epoch": 2.98, + "grad_norm": 0.0007787913632164233, + "learning_rate": 7.67925064368942e-08, + "logits/chosen": -2.606074333190918, + "logits/rejected": -2.5244290828704834, + "logps/chosen": -207.57579040527344, + "logps/rejected": -584.1116333007812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0975456237792969, + "rewards/margins": 33.3361701965332, + "rewards/rejected": -34.4337158203125, + "step": 419 + }, + { + "epoch": 2.98, + "grad_norm": 0.0005382026604040729, + "learning_rate": 7.576560783617667e-08, + "logits/chosen": -2.5749430656433105, + "logits/rejected": -2.5056445598602295, + "logps/chosen": -240.61715698242188, + "logps/rejected": -659.0277709960938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.137559175491333, + "rewards/margins": 37.74388122558594, + "rewards/rejected": -40.88144302368164, + "step": 420 + }, + { + "epoch": 2.99, + "grad_norm": 9.292779493030336e-06, + "learning_rate": 7.474439392917225e-08, + "logits/chosen": -2.584118604660034, + "logits/rejected": -2.493955135345459, + "logps/chosen": -256.4057312011719, + "logps/rejected": -658.6982421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.002318859100342, + "rewards/margins": 41.96442413330078, + "rewards/rejected": -44.966739654541016, + "step": 421 + }, + { + "epoch": 3.0, + "grad_norm": 0.0001844743295156698, + "learning_rate": 7.372889803472357e-08, + "logits/chosen": -2.638542652130127, + "logits/rejected": -2.52860164642334, + "logps/chosen": -261.7498474121094, + "logps/rejected": -678.9000244140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0135116577148438, + "rewards/margins": 41.05814743041992, + "rewards/rejected": -43.0716552734375, + "step": 422 + }, + { + "epoch": 3.01, + "grad_norm": 0.00015675429860846467, + "learning_rate": 7.271915328511341e-08, + "logits/chosen": -2.619159460067749, + "logits/rejected": -2.5243239402770996, + "logps/chosen": -232.90634155273438, + "logps/rejected": -631.2651977539062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.388845205307007, + "rewards/margins": 39.444183349609375, + "rewards/rejected": -41.833030700683594, + "step": 423 + }, + { + "epoch": 3.01, + "grad_norm": 0.00017903404005817177, + "learning_rate": 7.17151926249832e-08, + "logits/chosen": -2.5897724628448486, + "logits/rejected": -2.5546176433563232, + "logps/chosen": -226.81173706054688, + "logps/rejected": -638.1729736328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.855764150619507, + "rewards/margins": 36.43244171142578, + "rewards/rejected": -40.2882080078125, + "step": 424 + }, + { + "epoch": 3.02, + "grad_norm": 2.0960710800808192e-05, + "learning_rate": 7.071704881025914e-08, + "logits/chosen": -2.6414356231689453, + "logits/rejected": -2.56823992729187, + "logps/chosen": -218.68051147460938, + "logps/rejected": -666.3444213867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4178571701049805, + "rewards/margins": 44.369606018066406, + "rewards/rejected": -45.7874641418457, + "step": 425 + }, + { + "epoch": 3.03, + "grad_norm": 0.0007854590569850007, + "learning_rate": 6.97247544070827e-08, + "logits/chosen": -2.6744937896728516, + "logits/rejected": -2.649486541748047, + "logps/chosen": -222.39822387695312, + "logps/rejected": -639.3123779296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8207868337631226, + "rewards/margins": 42.12135314941406, + "rewards/rejected": -43.942138671875, + "step": 426 + }, + { + "epoch": 3.03, + "grad_norm": 0.0002935144167188196, + "learning_rate": 6.873834179074828e-08, + "logits/chosen": -2.6584057807922363, + "logits/rejected": -2.590904712677002, + "logps/chosen": -260.83355712890625, + "logps/rejected": -696.1884765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7906746864318848, + "rewards/margins": 44.078712463378906, + "rewards/rejected": -45.86939239501953, + "step": 427 + }, + { + "epoch": 3.04, + "grad_norm": 0.00013518808528707293, + "learning_rate": 6.775784314464716e-08, + "logits/chosen": -2.6212317943573, + "logits/rejected": -2.5604944229125977, + "logps/chosen": -265.4521484375, + "logps/rejected": -680.1613159179688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1763235330581665, + "rewards/margins": 40.19994354248047, + "rewards/rejected": -41.37626647949219, + "step": 428 + }, + { + "epoch": 3.05, + "grad_norm": 0.0007385701194043996, + "learning_rate": 6.678329045921705e-08, + "logits/chosen": -2.6681649684906006, + "logits/rejected": -2.637259006500244, + "logps/chosen": -267.07244873046875, + "logps/rejected": -776.3245849609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3884406089782715, + "rewards/margins": 47.06909942626953, + "rewards/rejected": -48.45753860473633, + "step": 429 + }, + { + "epoch": 3.06, + "grad_norm": 0.0004289170765386423, + "learning_rate": 6.581471553089874e-08, + "logits/chosen": -2.6739063262939453, + "logits/rejected": -2.609889030456543, + "logps/chosen": -230.77340698242188, + "logps/rejected": -715.3961181640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4532427787780762, + "rewards/margins": 44.552433013916016, + "rewards/rejected": -46.00567626953125, + "step": 430 + }, + { + "epoch": 3.06, + "grad_norm": 0.00010058556584155201, + "learning_rate": 6.485214996109856e-08, + "logits/chosen": -2.622065544128418, + "logits/rejected": -2.601836919784546, + "logps/chosen": -204.00198364257812, + "logps/rejected": -671.0438842773438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9684455394744873, + "rewards/margins": 41.534996032714844, + "rewards/rejected": -43.503440856933594, + "step": 431 + }, + { + "epoch": 3.07, + "grad_norm": 0.0007467877345351995, + "learning_rate": 6.389562515515707e-08, + "logits/chosen": -2.6374704837799072, + "logits/rejected": -2.6155998706817627, + "logps/chosen": -194.61004638671875, + "logps/rejected": -704.1629638671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.849491596221924, + "rewards/margins": 44.99718475341797, + "rewards/rejected": -47.846675872802734, + "step": 432 + }, + { + "epoch": 3.08, + "grad_norm": 0.00044670088710771624, + "learning_rate": 6.294517232132465e-08, + "logits/chosen": -2.659945249557495, + "logits/rejected": -2.5465316772460938, + "logps/chosen": -250.86761474609375, + "logps/rejected": -666.8660888671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.848118305206299, + "rewards/margins": 38.25925064086914, + "rewards/rejected": -41.10737228393555, + "step": 433 + }, + { + "epoch": 3.08, + "grad_norm": 6.48868533989718e-05, + "learning_rate": 6.200082246974355e-08, + "logits/chosen": -2.6127500534057617, + "logits/rejected": -2.5741114616394043, + "logps/chosen": -184.47512817382812, + "logps/rejected": -688.40576171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6410200595855713, + "rewards/margins": 45.58583068847656, + "rewards/rejected": -48.22685241699219, + "step": 434 + }, + { + "epoch": 3.09, + "grad_norm": 9.791014829118608e-05, + "learning_rate": 6.106260641143546e-08, + "logits/chosen": -2.5773262977600098, + "logits/rejected": -2.570362091064453, + "logps/chosen": -170.28579711914062, + "logps/rejected": -666.8358154296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7577791213989258, + "rewards/margins": 46.96244430541992, + "rewards/rejected": -47.7202262878418, + "step": 435 + }, + { + "epoch": 3.1, + "grad_norm": 1.5004390251260581e-05, + "learning_rate": 6.01305547572968e-08, + "logits/chosen": -2.6195147037506104, + "logits/rejected": -2.5563650131225586, + "logps/chosen": -269.1353759765625, + "logps/rejected": -731.1051635742188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.232271194458008, + "rewards/margins": 44.905609130859375, + "rewards/rejected": -48.13787841796875, + "step": 436 + }, + { + "epoch": 3.1, + "grad_norm": 0.00032673267499134964, + "learning_rate": 5.920469791709992e-08, + "logits/chosen": -2.5946974754333496, + "logits/rejected": -2.568236827850342, + "logps/chosen": -214.07272338867188, + "logps/rejected": -672.0029296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2888336181640625, + "rewards/margins": 41.524375915527344, + "rewards/rejected": -43.813209533691406, + "step": 437 + }, + { + "epoch": 3.11, + "grad_norm": 4.4418244750992484e-05, + "learning_rate": 5.828506609850053e-08, + "logits/chosen": -2.617732048034668, + "logits/rejected": -2.509762763977051, + "logps/chosen": -297.3631591796875, + "logps/rejected": -641.6201782226562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8634636402130127, + "rewards/margins": 35.527565002441406, + "rewards/rejected": -37.39102554321289, + "step": 438 + }, + { + "epoch": 3.12, + "grad_norm": 0.0007374603325927721, + "learning_rate": 5.737168930605271e-08, + "logits/chosen": -2.5229742527008057, + "logits/rejected": -2.521803379058838, + "logps/chosen": -213.2022247314453, + "logps/rejected": -588.3814697265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8263041973114014, + "rewards/margins": 36.19573211669922, + "rewards/rejected": -39.022037506103516, + "step": 439 + }, + { + "epoch": 3.13, + "grad_norm": 5.365663900600486e-05, + "learning_rate": 5.6464597340229375e-08, + "logits/chosen": -2.592064380645752, + "logits/rejected": -2.549136161804199, + "logps/chosen": -228.0621337890625, + "logps/rejected": -695.7317504882812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.631687164306641, + "rewards/margins": 43.12358474731445, + "rewards/rejected": -47.755271911621094, + "step": 440 + }, + { + "epoch": 3.13, + "grad_norm": 1.576471638835058e-05, + "learning_rate": 5.55638197964505e-08, + "logits/chosen": -2.6317970752716064, + "logits/rejected": -2.578443765640259, + "logps/chosen": -222.76080322265625, + "logps/rejected": -696.7114868164062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8347560167312622, + "rewards/margins": 47.23423767089844, + "rewards/rejected": -49.068992614746094, + "step": 441 + }, + { + "epoch": 3.14, + "grad_norm": 0.0005162635517962829, + "learning_rate": 5.4669386064117306e-08, + "logits/chosen": -2.6166346073150635, + "logits/rejected": -2.5035459995269775, + "logps/chosen": -235.9587860107422, + "logps/rejected": -633.7101440429688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.359239339828491, + "rewards/margins": 40.840423583984375, + "rewards/rejected": -43.19966506958008, + "step": 442 + }, + { + "epoch": 3.15, + "grad_norm": 0.0002706848195368485, + "learning_rate": 5.378132532565302e-08, + "logits/chosen": -2.6102285385131836, + "logits/rejected": -2.6179120540618896, + "logps/chosen": -191.69741821289062, + "logps/rejected": -694.4356079101562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2505335807800293, + "rewards/margins": 45.38935470581055, + "rewards/rejected": -47.639892578125, + "step": 443 + }, + { + "epoch": 3.15, + "grad_norm": 6.803961643155605e-06, + "learning_rate": 5.289966655555145e-08, + "logits/chosen": -2.6046628952026367, + "logits/rejected": -2.528510093688965, + "logps/chosen": -219.8808135986328, + "logps/rejected": -701.1609497070312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1479681730270386, + "rewards/margins": 43.49080276489258, + "rewards/rejected": -44.638771057128906, + "step": 444 + }, + { + "epoch": 3.16, + "grad_norm": 0.0002550053832259183, + "learning_rate": 5.202443851943125e-08, + "logits/chosen": -2.630103349685669, + "logits/rejected": -2.6082844734191895, + "logps/chosen": -234.16244506835938, + "logps/rejected": -662.3298950195312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9906439781188965, + "rewards/margins": 38.79182434082031, + "rewards/rejected": -40.78246307373047, + "step": 445 + }, + { + "epoch": 3.17, + "grad_norm": 0.0010991099474336478, + "learning_rate": 5.1155669773097237e-08, + "logits/chosen": -2.596006393432617, + "logits/rejected": -2.5671186447143555, + "logps/chosen": -204.6609344482422, + "logps/rejected": -633.6109619140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.504817008972168, + "rewards/margins": 37.750221252441406, + "rewards/rejected": -40.255035400390625, + "step": 446 + }, + { + "epoch": 3.18, + "grad_norm": 0.0011109072080535067, + "learning_rate": 5.029338866160912e-08, + "logits/chosen": -2.673215866088867, + "logits/rejected": -2.615715742111206, + "logps/chosen": -235.2247314453125, + "logps/rejected": -690.780517578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.436384439468384, + "rewards/margins": 38.867286682128906, + "rewards/rejected": -42.30366897583008, + "step": 447 + }, + { + "epoch": 3.18, + "grad_norm": 1.5159955147072827e-06, + "learning_rate": 4.943762331835621e-08, + "logits/chosen": -2.6637094020843506, + "logits/rejected": -2.6242637634277344, + "logps/chosen": -260.738525390625, + "logps/rejected": -713.22802734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2674665451049805, + "rewards/margins": 42.369041442871094, + "rewards/rejected": -46.636505126953125, + "step": 448 + }, + { + "epoch": 3.19, + "grad_norm": 0.0007982793586033864, + "learning_rate": 4.8588401664140075e-08, + "logits/chosen": -2.6574273109436035, + "logits/rejected": -2.6445178985595703, + "logps/chosen": -253.44577026367188, + "logps/rejected": -814.5030517578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14744363725185394, + "rewards/margins": 52.01709747314453, + "rewards/rejected": -52.1645393371582, + "step": 449 + }, + { + "epoch": 3.2, + "grad_norm": 0.001282969735919351, + "learning_rate": 4.774575140626316e-08, + "logits/chosen": -2.621854305267334, + "logits/rejected": -2.570544481277466, + "logps/chosen": -247.10720825195312, + "logps/rejected": -615.7158813476562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2117457389831543, + "rewards/margins": 34.38936996459961, + "rewards/rejected": -37.601112365722656, + "step": 450 + }, + { + "epoch": 3.2, + "grad_norm": 9.454149616680205e-05, + "learning_rate": 4.690970003762487e-08, + "logits/chosen": -2.6110453605651855, + "logits/rejected": -2.55108642578125, + "logps/chosen": -273.7696838378906, + "logps/rejected": -664.56787109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.187952756881714, + "rewards/margins": 37.03415298461914, + "rewards/rejected": -40.222103118896484, + "step": 451 + }, + { + "epoch": 3.21, + "grad_norm": 5.947931444527806e-05, + "learning_rate": 4.608027483582458e-08, + "logits/chosen": -2.6085867881774902, + "logits/rejected": -2.575075149536133, + "logps/chosen": -252.9950408935547, + "logps/rejected": -711.0648193359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9339390993118286, + "rewards/margins": 40.951297760009766, + "rewards/rejected": -42.88523864746094, + "step": 452 + }, + { + "epoch": 3.22, + "grad_norm": 1.6295645545392258e-07, + "learning_rate": 4.5257502862271865e-08, + "logits/chosen": -2.673825979232788, + "logits/rejected": -2.613050937652588, + "logps/chosen": -227.05526733398438, + "logps/rejected": -724.9359741210938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0232808589935303, + "rewards/margins": 44.69103240966797, + "rewards/rejected": -47.71431350708008, + "step": 453 + }, + { + "epoch": 3.23, + "grad_norm": 0.00034465030146641835, + "learning_rate": 4.4441410961303216e-08, + "logits/chosen": -2.6254165172576904, + "logits/rejected": -2.580132484436035, + "logps/chosen": -226.70742797851562, + "logps/rejected": -731.5558471679688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.111647605895996, + "rewards/margins": 41.85982131958008, + "rewards/rejected": -43.971466064453125, + "step": 454 + }, + { + "epoch": 3.23, + "grad_norm": 0.0005355842316390446, + "learning_rate": 4.3632025759306494e-08, + "logits/chosen": -2.6284639835357666, + "logits/rejected": -2.5551962852478027, + "logps/chosen": -253.06008911132812, + "logps/rejected": -653.2129516601562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4641356468200684, + "rewards/margins": 39.57658386230469, + "rewards/rejected": -42.04071807861328, + "step": 455 + }, + { + "epoch": 3.24, + "grad_norm": 0.00017449806764697394, + "learning_rate": 4.282937366385214e-08, + "logits/chosen": -2.5922813415527344, + "logits/rejected": -2.5443663597106934, + "logps/chosen": -185.9475860595703, + "logps/rejected": -571.4390869140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.181006908416748, + "rewards/margins": 35.94232177734375, + "rewards/rejected": -38.123329162597656, + "step": 456 + }, + { + "epoch": 3.25, + "grad_norm": 0.00010335418438558921, + "learning_rate": 4.203348086283129e-08, + "logits/chosen": -2.581742763519287, + "logits/rejected": -2.5747532844543457, + "logps/chosen": -230.5684356689453, + "logps/rejected": -672.25, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5278213024139404, + "rewards/margins": 40.950347900390625, + "rewards/rejected": -42.47816467285156, + "step": 457 + }, + { + "epoch": 3.25, + "grad_norm": 0.0013226129906461859, + "learning_rate": 4.124437332360187e-08, + "logits/chosen": -2.617544174194336, + "logits/rejected": -2.5754756927490234, + "logps/chosen": -219.6588592529297, + "logps/rejected": -656.9884033203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6963422894477844, + "rewards/margins": 42.76883316040039, + "rewards/rejected": -42.07249069213867, + "step": 458 + }, + { + "epoch": 3.26, + "grad_norm": 0.0001715327073755561, + "learning_rate": 4.0462076792140864e-08, + "logits/chosen": -2.656161308288574, + "logits/rejected": -2.5836374759674072, + "logps/chosen": -239.05865478515625, + "logps/rejected": -669.0706787109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.6512041091918945, + "rewards/margins": 41.9021110534668, + "rewards/rejected": -46.553314208984375, + "step": 459 + }, + { + "epoch": 3.27, + "grad_norm": 0.0009731090118499727, + "learning_rate": 3.968661679220467e-08, + "logits/chosen": -2.6088342666625977, + "logits/rejected": -2.636565685272217, + "logps/chosen": -205.7718505859375, + "logps/rejected": -702.478515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4405109882354736, + "rewards/margins": 43.994564056396484, + "rewards/rejected": -45.43507385253906, + "step": 460 + }, + { + "epoch": 3.28, + "grad_norm": 8.397492754504226e-05, + "learning_rate": 3.8918018624496286e-08, + "logits/chosen": -2.5314102172851562, + "logits/rejected": -2.5911784172058105, + "logps/chosen": -264.538818359375, + "logps/rejected": -663.1170043945312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.880234241485596, + "rewards/margins": 35.40547561645508, + "rewards/rejected": -40.28571319580078, + "step": 461 + }, + { + "epoch": 3.28, + "grad_norm": 0.0006625360748079051, + "learning_rate": 3.815630736583938e-08, + "logits/chosen": -2.5914340019226074, + "logits/rejected": -2.596475124359131, + "logps/chosen": -176.09242248535156, + "logps/rejected": -637.4105224609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.100076198577881, + "rewards/margins": 41.66419982910156, + "rewards/rejected": -43.764278411865234, + "step": 462 + }, + { + "epoch": 3.29, + "grad_norm": 0.0004801495424052975, + "learning_rate": 3.740150786836085e-08, + "logits/chosen": -2.636920213699341, + "logits/rejected": -2.523773431777954, + "logps/chosen": -249.05508422851562, + "logps/rejected": -673.0238037109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.301358699798584, + "rewards/margins": 43.87980270385742, + "rewards/rejected": -47.18115997314453, + "step": 463 + }, + { + "epoch": 3.3, + "grad_norm": 0.0009906118242465466, + "learning_rate": 3.6653644758679573e-08, + "logits/chosen": -2.5939364433288574, + "logits/rejected": -2.615522623062134, + "logps/chosen": -223.13760375976562, + "logps/rejected": -715.3855590820312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.248537540435791, + "rewards/margins": 39.70538330078125, + "rewards/rejected": -41.953922271728516, + "step": 464 + }, + { + "epoch": 3.3, + "grad_norm": 0.0002533772466087041, + "learning_rate": 3.5912742437102765e-08, + "logits/chosen": -2.610377788543701, + "logits/rejected": -2.5832855701446533, + "logps/chosen": -201.83145141601562, + "logps/rejected": -694.4752197265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0029077529907227, + "rewards/margins": 40.71278381347656, + "rewards/rejected": -43.71569061279297, + "step": 465 + }, + { + "epoch": 3.31, + "grad_norm": 0.0009222904010930973, + "learning_rate": 3.517882507683023e-08, + "logits/chosen": -2.585434675216675, + "logits/rejected": -2.5781028270721436, + "logps/chosen": -215.09523010253906, + "logps/rejected": -676.3477783203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6579089164733887, + "rewards/margins": 39.08197021484375, + "rewards/rejected": -42.73987579345703, + "step": 466 + }, + { + "epoch": 3.32, + "grad_norm": 0.0008526568343094957, + "learning_rate": 3.44519166231656e-08, + "logits/chosen": -2.623109817504883, + "logits/rejected": -2.5855765342712402, + "logps/chosen": -215.90774536132812, + "logps/rejected": -714.1204223632812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0286612510681152, + "rewards/margins": 46.990264892578125, + "rewards/rejected": -49.0189208984375, + "step": 467 + }, + { + "epoch": 3.33, + "grad_norm": 6.633828834920054e-06, + "learning_rate": 3.373204079273473e-08, + "logits/chosen": -2.559525966644287, + "logits/rejected": -2.552218437194824, + "logps/chosen": -162.26724243164062, + "logps/rejected": -622.2674560546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8777978420257568, + "rewards/margins": 38.93651580810547, + "rewards/rejected": -40.81431579589844, + "step": 468 + }, + { + "epoch": 3.33, + "grad_norm": 0.0006665917719801269, + "learning_rate": 3.301922107271243e-08, + "logits/chosen": -2.596078634262085, + "logits/rejected": -2.4895009994506836, + "logps/chosen": -212.92372131347656, + "logps/rejected": -650.4541015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6449453830718994, + "rewards/margins": 38.674232482910156, + "rewards/rejected": -40.31917953491211, + "step": 469 + }, + { + "epoch": 3.34, + "grad_norm": 7.649758307006021e-05, + "learning_rate": 3.231348072005574e-08, + "logits/chosen": -2.6433959007263184, + "logits/rejected": -2.5822606086730957, + "logps/chosen": -251.4542694091797, + "logps/rejected": -687.633544921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6771730184555054, + "rewards/margins": 40.624908447265625, + "rewards/rejected": -42.30208206176758, + "step": 470 + }, + { + "epoch": 3.35, + "grad_norm": 8.506234929457733e-06, + "learning_rate": 3.1614842760745276e-08, + "logits/chosen": -2.6340794563293457, + "logits/rejected": -2.5515761375427246, + "logps/chosen": -235.64590454101562, + "logps/rejected": -704.73388671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6118698120117188, + "rewards/margins": 44.503517150878906, + "rewards/rejected": -48.115386962890625, + "step": 471 + }, + { + "epoch": 3.35, + "grad_norm": 0.0001168482515299689, + "learning_rate": 3.092332998903416e-08, + "logits/chosen": -2.6512835025787354, + "logits/rejected": -2.590428113937378, + "logps/chosen": -290.42193603515625, + "logps/rejected": -771.2406005859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8223824501037598, + "rewards/margins": 47.038368225097656, + "rewards/rejected": -49.860748291015625, + "step": 472 + }, + { + "epoch": 3.36, + "grad_norm": 0.0004975248970249615, + "learning_rate": 3.023896496670383e-08, + "logits/chosen": -2.5856897830963135, + "logits/rejected": -2.576575756072998, + "logps/chosen": -210.48031616210938, + "logps/rejected": -677.15576171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8332419395446777, + "rewards/margins": 43.8298454284668, + "rewards/rejected": -46.6630859375, + "step": 473 + }, + { + "epoch": 3.37, + "grad_norm": 4.821281310025656e-05, + "learning_rate": 2.9561770022328543e-08, + "logits/chosen": -2.6261911392211914, + "logits/rejected": -2.6302576065063477, + "logps/chosen": -178.09519958496094, + "logps/rejected": -635.4549560546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.901778221130371, + "rewards/margins": 39.83304214477539, + "rewards/rejected": -42.73482131958008, + "step": 474 + }, + { + "epoch": 3.37, + "grad_norm": 7.455565948878168e-05, + "learning_rate": 2.889176725054643e-08, + "logits/chosen": -2.5789871215820312, + "logits/rejected": -2.5514464378356934, + "logps/chosen": -214.8177032470703, + "logps/rejected": -716.22314453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.622589588165283, + "rewards/margins": 45.57851791381836, + "rewards/rejected": -49.20111083984375, + "step": 475 + }, + { + "epoch": 3.38, + "grad_norm": 0.0009251900634951763, + "learning_rate": 2.8228978511338653e-08, + "logits/chosen": -2.676788806915283, + "logits/rejected": -2.570706844329834, + "logps/chosen": -249.06582641601562, + "logps/rejected": -681.4833374023438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29381561279296875, + "rewards/margins": 43.217472076416016, + "rewards/rejected": -42.92365264892578, + "step": 476 + }, + { + "epoch": 3.39, + "grad_norm": 2.895716274006362e-05, + "learning_rate": 2.7573425429316427e-08, + "logits/chosen": -2.625197649002075, + "logits/rejected": -2.6218719482421875, + "logps/chosen": -209.53350830078125, + "logps/rejected": -661.5867309570312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1433513164520264, + "rewards/margins": 43.01491928100586, + "rewards/rejected": -45.15827178955078, + "step": 477 + }, + { + "epoch": 3.4, + "grad_norm": 5.0732092107721015e-05, + "learning_rate": 2.6925129393015194e-08, + "logits/chosen": -2.6054232120513916, + "logits/rejected": -2.5815982818603516, + "logps/chosen": -241.37652587890625, + "logps/rejected": -731.122802734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7485368251800537, + "rewards/margins": 46.736846923828125, + "rewards/rejected": -49.485382080078125, + "step": 478 + }, + { + "epoch": 3.4, + "grad_norm": 0.0004648308719619941, + "learning_rate": 2.628411155419702e-08, + "logits/chosen": -2.6175825595855713, + "logits/rejected": -2.5356669425964355, + "logps/chosen": -303.48773193359375, + "logps/rejected": -730.9442749023438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6255733966827393, + "rewards/margins": 42.03935241699219, + "rewards/rejected": -44.66492462158203, + "step": 479 + }, + { + "epoch": 3.41, + "grad_norm": 0.00011207090812682462, + "learning_rate": 2.5650392827160443e-08, + "logits/chosen": -2.599677324295044, + "logits/rejected": -2.5649120807647705, + "logps/chosen": -207.89907836914062, + "logps/rejected": -661.1048583984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4871084690093994, + "rewards/margins": 41.663997650146484, + "rewards/rejected": -43.151100158691406, + "step": 480 + }, + { + "epoch": 3.42, + "grad_norm": 0.00015545009269222758, + "learning_rate": 2.5023993888057814e-08, + "logits/chosen": -2.6464436054229736, + "logits/rejected": -2.5787558555603027, + "logps/chosen": -235.5123748779297, + "logps/rejected": -659.9240112304688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2386937141418457, + "rewards/margins": 36.04393768310547, + "rewards/rejected": -38.28262710571289, + "step": 481 + }, + { + "epoch": 3.42, + "grad_norm": 0.0002233972408463592, + "learning_rate": 2.4404935174221153e-08, + "logits/chosen": -2.567603588104248, + "logits/rejected": -2.5298640727996826, + "logps/chosen": -213.93267822265625, + "logps/rejected": -597.7510375976562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.363161563873291, + "rewards/margins": 35.06783676147461, + "rewards/rejected": -38.430999755859375, + "step": 482 + }, + { + "epoch": 3.43, + "grad_norm": 0.0012324860427763852, + "learning_rate": 2.379323688349516e-08, + "logits/chosen": -2.5774900913238525, + "logits/rejected": -2.6287176609039307, + "logps/chosen": -171.79183959960938, + "logps/rejected": -635.9149780273438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9400241374969482, + "rewards/margins": 38.67039489746094, + "rewards/rejected": -41.61042022705078, + "step": 483 + }, + { + "epoch": 3.44, + "grad_norm": 0.00037494952726802256, + "learning_rate": 2.3188918973577943e-08, + "logits/chosen": -2.6392977237701416, + "logits/rejected": -2.5665433406829834, + "logps/chosen": -283.3963928222656, + "logps/rejected": -645.909423828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6416029930114746, + "rewards/margins": 37.30474853515625, + "rewards/rejected": -40.94635009765625, + "step": 484 + }, + { + "epoch": 3.45, + "grad_norm": 0.0011688828079140538, + "learning_rate": 2.259200116137039e-08, + "logits/chosen": -2.5678482055664062, + "logits/rejected": -2.5209760665893555, + "logps/chosen": -205.92849731445312, + "logps/rejected": -627.2208862304688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3394737243652344, + "rewards/margins": 37.12411880493164, + "rewards/rejected": -39.463592529296875, + "step": 485 + }, + { + "epoch": 3.45, + "grad_norm": 0.0004476742034145017, + "learning_rate": 2.200250292233252e-08, + "logits/chosen": -2.6382713317871094, + "logits/rejected": -2.556351661682129, + "logps/chosen": -245.54376220703125, + "logps/rejected": -639.1724853515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.42406964302063, + "rewards/margins": 36.78460693359375, + "rewards/rejected": -40.20867919921875, + "step": 486 + }, + { + "epoch": 3.46, + "grad_norm": 0.0012053657353027721, + "learning_rate": 2.1420443489848032e-08, + "logits/chosen": -2.61869478225708, + "logits/rejected": -2.568211317062378, + "logps/chosen": -243.75865173339844, + "logps/rejected": -678.59765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5160446166992188, + "rewards/margins": 40.444679260253906, + "rewards/rejected": -42.96072006225586, + "step": 487 + }, + { + "epoch": 3.47, + "grad_norm": 0.0010114045558550758, + "learning_rate": 2.084584185459709e-08, + "logits/chosen": -2.5804500579833984, + "logits/rejected": -2.5760929584503174, + "logps/chosen": -217.6485137939453, + "logps/rejected": -607.8234252929688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5181527137756348, + "rewards/margins": 34.77845001220703, + "rewards/rejected": -37.29660415649414, + "step": 488 + }, + { + "epoch": 3.47, + "grad_norm": 5.336195329667559e-05, + "learning_rate": 2.0278716763936333e-08, + "logits/chosen": -2.6508703231811523, + "logits/rejected": -2.6321702003479004, + "logps/chosen": -210.61563110351562, + "logps/rejected": -636.890380859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6222862005233765, + "rewards/margins": 40.196022033691406, + "rewards/rejected": -41.81830978393555, + "step": 489 + }, + { + "epoch": 3.48, + "grad_norm": 0.0006390370626577082, + "learning_rate": 1.97190867212875e-08, + "logits/chosen": -2.6204140186309814, + "logits/rejected": -2.6479151248931885, + "logps/chosen": -226.50967407226562, + "logps/rejected": -688.518798828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1824357509613037, + "rewards/margins": 43.68347930908203, + "rewards/rejected": -45.86591720581055, + "step": 490 + }, + { + "epoch": 3.49, + "grad_norm": 0.0009767070044428706, + "learning_rate": 1.9166969985533633e-08, + "logits/chosen": -2.5982120037078857, + "logits/rejected": -2.560797929763794, + "logps/chosen": -201.7589111328125, + "logps/rejected": -692.314208984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.432668924331665, + "rewards/margins": 44.250152587890625, + "rewards/rejected": -47.68282699584961, + "step": 491 + }, + { + "epoch": 3.5, + "grad_norm": 0.0004896318197236323, + "learning_rate": 1.8622384570423283e-08, + "logits/chosen": -2.628178596496582, + "logits/rejected": -2.5770137310028076, + "logps/chosen": -261.781982421875, + "logps/rejected": -755.8809814453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.642843008041382, + "rewards/margins": 46.544677734375, + "rewards/rejected": -49.18751525878906, + "step": 492 + }, + { + "epoch": 3.5, + "grad_norm": 3.687424866364429e-05, + "learning_rate": 1.8085348243982945e-08, + "logits/chosen": -2.6168880462646484, + "logits/rejected": -2.59796142578125, + "logps/chosen": -224.72430419921875, + "logps/rejected": -670.9227294921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.284109354019165, + "rewards/margins": 41.57099914550781, + "rewards/rejected": -44.855106353759766, + "step": 493 + }, + { + "epoch": 3.51, + "grad_norm": 1.621604755607951e-05, + "learning_rate": 1.7555878527937163e-08, + "logits/chosen": -2.6195032596588135, + "logits/rejected": -2.586146354675293, + "logps/chosen": -205.8924102783203, + "logps/rejected": -643.7384033203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4236364960670471, + "rewards/margins": 41.292572021484375, + "rewards/rejected": -41.71621322631836, + "step": 494 + }, + { + "epoch": 3.52, + "grad_norm": 0.0001012932416859143, + "learning_rate": 1.7033992697136928e-08, + "logits/chosen": -2.634021759033203, + "logits/rejected": -2.621947765350342, + "logps/chosen": -222.78009033203125, + "logps/rejected": -698.741455078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.666110038757324, + "rewards/margins": 44.308326721191406, + "rewards/rejected": -46.97443389892578, + "step": 495 + }, + { + "epoch": 3.52, + "grad_norm": 0.0012814306270193874, + "learning_rate": 1.6519707778996112e-08, + "logits/chosen": -2.65995717048645, + "logits/rejected": -2.5966336727142334, + "logps/chosen": -252.52761840820312, + "logps/rejected": -645.239501953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.911625385284424, + "rewards/margins": 38.49699401855469, + "rewards/rejected": -41.40861892700195, + "step": 496 + }, + { + "epoch": 3.53, + "grad_norm": 0.0004469534540923918, + "learning_rate": 1.6013040552935814e-08, + "logits/chosen": -2.657888650894165, + "logits/rejected": -2.5049285888671875, + "logps/chosen": -260.3426818847656, + "logps/rejected": -615.099853515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1667189598083496, + "rewards/margins": 32.84905242919922, + "rewards/rejected": -35.015769958496094, + "step": 497 + }, + { + "epoch": 3.54, + "grad_norm": 0.0003533065525876613, + "learning_rate": 1.5514007549836977e-08, + "logits/chosen": -2.5815937519073486, + "logits/rejected": -2.5754480361938477, + "logps/chosen": -192.523681640625, + "logps/rejected": -673.796142578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.826329231262207, + "rewards/margins": 42.26731491088867, + "rewards/rejected": -45.09364700317383, + "step": 498 + }, + { + "epoch": 3.55, + "grad_norm": 0.0008820900757082024, + "learning_rate": 1.5022625051501146e-08, + "logits/chosen": -2.684786319732666, + "logits/rejected": -2.6221065521240234, + "logps/chosen": -224.79844665527344, + "logps/rejected": -731.311767578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.523193597793579, + "rewards/margins": 46.43081283569336, + "rewards/rejected": -48.95400619506836, + "step": 499 + }, + { + "epoch": 3.55, + "grad_norm": 0.0007022216597256002, + "learning_rate": 1.4538909090118846e-08, + "logits/chosen": -2.646857261657715, + "logits/rejected": -2.552997589111328, + "logps/chosen": -277.7613220214844, + "logps/rejected": -708.944580078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8540782928466797, + "rewards/margins": 40.683101654052734, + "rewards/rejected": -43.53717803955078, + "step": 500 + }, + { + "epoch": 3.56, + "grad_norm": 7.626933847496309e-05, + "learning_rate": 1.4062875447747007e-08, + "logits/chosen": -2.594496965408325, + "logits/rejected": -2.5946741104125977, + "logps/chosen": -223.63864135742188, + "logps/rejected": -710.0927734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.271359920501709, + "rewards/margins": 43.48102569580078, + "rewards/rejected": -46.75238037109375, + "step": 501 + }, + { + "epoch": 3.57, + "grad_norm": 0.00024861932403556986, + "learning_rate": 1.3594539655793796e-08, + "logits/chosen": -2.687877893447876, + "logits/rejected": -2.6199045181274414, + "logps/chosen": -262.1263732910156, + "logps/rejected": -726.582275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.864876925945282, + "rewards/margins": 42.94416809082031, + "rewards/rejected": -43.809043884277344, + "step": 502 + }, + { + "epoch": 3.57, + "grad_norm": 2.4097112528033126e-06, + "learning_rate": 1.3133916994511773e-08, + "logits/chosen": -2.5927937030792236, + "logits/rejected": -2.61323618888855, + "logps/chosen": -210.65484619140625, + "logps/rejected": -718.58203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.27882719039917, + "rewards/margins": 46.08052062988281, + "rewards/rejected": -48.359344482421875, + "step": 503 + }, + { + "epoch": 3.58, + "grad_norm": 7.71270483531377e-06, + "learning_rate": 1.268102249249961e-08, + "logits/chosen": -2.631334066390991, + "logits/rejected": -2.575530529022217, + "logps/chosen": -273.2833557128906, + "logps/rejected": -669.0211791992188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2079830169677734, + "rewards/margins": 39.595706939697266, + "rewards/rejected": -42.803688049316406, + "step": 504 + }, + { + "epoch": 3.59, + "grad_norm": 0.0008737956074188803, + "learning_rate": 1.2235870926211616e-08, + "logits/chosen": -2.5961055755615234, + "logits/rejected": -2.6211953163146973, + "logps/chosen": -198.38851928710938, + "logps/rejected": -682.2308959960938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3557233810424805, + "rewards/margins": 39.54448318481445, + "rewards/rejected": -42.900203704833984, + "step": 505 + }, + { + "epoch": 3.6, + "grad_norm": 0.0008795061494976251, + "learning_rate": 1.1798476819475545e-08, + "logits/chosen": -2.6413016319274902, + "logits/rejected": -2.5976343154907227, + "logps/chosen": -217.11343383789062, + "logps/rejected": -643.8162841796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.70795476436615, + "rewards/margins": 39.021095275878906, + "rewards/rejected": -40.72905349731445, + "step": 506 + }, + { + "epoch": 3.6, + "grad_norm": 0.0006283498501396545, + "learning_rate": 1.136885444301891e-08, + "logits/chosen": -2.6185131072998047, + "logits/rejected": -2.582838535308838, + "logps/chosen": -219.7299041748047, + "logps/rejected": -705.4175415039062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1128225326538086, + "rewards/margins": 40.27669143676758, + "rewards/rejected": -42.38951110839844, + "step": 507 + }, + { + "epoch": 3.61, + "grad_norm": 9.46580616287934e-05, + "learning_rate": 1.0947017814003257e-08, + "logits/chosen": -2.630901336669922, + "logits/rejected": -2.616990566253662, + "logps/chosen": -191.52870178222656, + "logps/rejected": -619.3540649414062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.913646697998047, + "rewards/margins": 39.46686935424805, + "rewards/rejected": -42.380516052246094, + "step": 508 + }, + { + "epoch": 3.62, + "grad_norm": 0.00037132543557305, + "learning_rate": 1.053298069556685e-08, + "logits/chosen": -2.628443717956543, + "logits/rejected": -2.560385227203369, + "logps/chosen": -229.40475463867188, + "logps/rejected": -663.342529296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.225600481033325, + "rewards/margins": 39.87400436401367, + "rewards/rejected": -43.099609375, + "step": 509 + }, + { + "epoch": 3.62, + "grad_norm": 0.0007447269057197756, + "learning_rate": 1.0126756596375685e-08, + "logits/chosen": -2.5976340770721436, + "logits/rejected": -2.539245843887329, + "logps/chosen": -244.554443359375, + "logps/rejected": -738.638671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5170507431030273, + "rewards/margins": 44.766361236572266, + "rewards/rejected": -47.28341293334961, + "step": 510 + }, + { + "epoch": 3.63, + "grad_norm": 6.896584690189299e-05, + "learning_rate": 9.728358770182632e-09, + "logits/chosen": -2.592754364013672, + "logits/rejected": -2.5863196849823, + "logps/chosen": -192.65524291992188, + "logps/rejected": -690.9212646484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1790452003479004, + "rewards/margins": 45.14292526245117, + "rewards/rejected": -47.32197189331055, + "step": 511 + }, + { + "epoch": 3.64, + "grad_norm": 0.00047320752889572296, + "learning_rate": 9.337800215395153e-09, + "logits/chosen": -2.5972468852996826, + "logits/rejected": -2.621363878250122, + "logps/chosen": -199.30767822265625, + "logps/rejected": -749.0303955078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4131829738616943, + "rewards/margins": 47.307273864746094, + "rewards/rejected": -49.720455169677734, + "step": 512 + }, + { + "epoch": 3.64, + "grad_norm": 2.4917517056810806e-06, + "learning_rate": 8.955093674651138e-09, + "logits/chosen": -2.5982255935668945, + "logits/rejected": -2.5934972763061523, + "logps/chosen": -201.77896118164062, + "logps/rejected": -723.018310546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4499902725219727, + "rewards/margins": 48.176429748535156, + "rewards/rejected": -49.62641906738281, + "step": 513 + }, + { + "epoch": 3.65, + "grad_norm": 0.0008180405526448468, + "learning_rate": 8.580251634403041e-09, + "logits/chosen": -2.6712136268615723, + "logits/rejected": -2.5899572372436523, + "logps/chosen": -199.0433807373047, + "logps/rejected": -614.0590209960938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8926000595092773, + "rewards/margins": 40.0224494934082, + "rewards/rejected": -40.9150505065918, + "step": 514 + }, + { + "epoch": 3.66, + "grad_norm": 0.0006238348559954941, + "learning_rate": 8.213286324510737e-09, + "logits/chosen": -2.57464861869812, + "logits/rejected": -2.5571444034576416, + "logps/chosen": -175.18162536621094, + "logps/rejected": -677.3331909179688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3283417224884033, + "rewards/margins": 44.110809326171875, + "rewards/rejected": -46.43914794921875, + "step": 515 + }, + { + "epoch": 3.67, + "grad_norm": 9.958606067245237e-06, + "learning_rate": 7.85420971784223e-09, + "logits/chosen": -2.5616984367370605, + "logits/rejected": -2.4937665462493896, + "logps/chosen": -219.8465576171875, + "logps/rejected": -674.5919189453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4693191051483154, + "rewards/margins": 41.072147369384766, + "rewards/rejected": -42.541465759277344, + "step": 516 + }, + { + "epoch": 3.67, + "grad_norm": 0.001077652399715273, + "learning_rate": 7.50303352988324e-09, + "logits/chosen": -2.615463972091675, + "logits/rejected": -2.5997154712677, + "logps/chosen": -239.47915649414062, + "logps/rejected": -742.1309814453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6052544116973877, + "rewards/margins": 44.79793167114258, + "rewards/rejected": -47.4031867980957, + "step": 517 + }, + { + "epoch": 3.68, + "grad_norm": 0.00047516180207854197, + "learning_rate": 7.1597692183548716e-09, + "logits/chosen": -2.652799129486084, + "logits/rejected": -2.569153070449829, + "logps/chosen": -227.27255249023438, + "logps/rejected": -670.1201171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11313013732433319, + "rewards/margins": 44.74843978881836, + "rewards/rejected": -44.635311126708984, + "step": 518 + }, + { + "epoch": 3.69, + "grad_norm": 8.261507068562523e-05, + "learning_rate": 6.824427982839748e-09, + "logits/chosen": -2.6327054500579834, + "logits/rejected": -2.661642551422119, + "logps/chosen": -187.54229736328125, + "logps/rejected": -650.0072021484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.911550998687744, + "rewards/margins": 38.738792419433594, + "rewards/rejected": -41.65034484863281, + "step": 519 + }, + { + "epoch": 3.69, + "grad_norm": 9.206493736216976e-07, + "learning_rate": 6.497020764416633e-09, + "logits/chosen": -2.675015449523926, + "logits/rejected": -2.6438331604003906, + "logps/chosen": -198.87820434570312, + "logps/rejected": -684.9061279296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2558302879333496, + "rewards/margins": 45.55830383300781, + "rewards/rejected": -47.81413650512695, + "step": 520 + }, + { + "epoch": 3.7, + "grad_norm": 0.0007809071733183488, + "learning_rate": 6.1775582453035545e-09, + "logits/chosen": -2.5540084838867188, + "logits/rejected": -2.55930757522583, + "logps/chosen": -220.89276123046875, + "logps/rejected": -759.425537109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5850014686584473, + "rewards/margins": 46.94213104248047, + "rewards/rejected": -50.52713394165039, + "step": 521 + }, + { + "epoch": 3.71, + "grad_norm": 6.764969813237873e-05, + "learning_rate": 5.86605084850908e-09, + "logits/chosen": -2.6058170795440674, + "logits/rejected": -2.5553784370422363, + "logps/chosen": -203.81137084960938, + "logps/rejected": -582.5670776367188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5960230827331543, + "rewards/margins": 35.02447509765625, + "rewards/rejected": -37.62049865722656, + "step": 522 + }, + { + "epoch": 3.72, + "grad_norm": 0.00018714000875466163, + "learning_rate": 5.562508737492477e-09, + "logits/chosen": -2.6227502822875977, + "logits/rejected": -2.570919990539551, + "logps/chosen": -213.3363037109375, + "logps/rejected": -605.679931640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.330808639526367, + "rewards/margins": 34.076942443847656, + "rewards/rejected": -37.407752990722656, + "step": 523 + }, + { + "epoch": 3.72, + "grad_norm": 2.7632096492312213e-05, + "learning_rate": 5.266941815831982e-09, + "logits/chosen": -2.6446104049682617, + "logits/rejected": -2.5895838737487793, + "logps/chosen": -239.43194580078125, + "logps/rejected": -797.6595458984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4859473407268524, + "rewards/margins": 53.39768600463867, + "rewards/rejected": -53.883628845214844, + "step": 524 + }, + { + "epoch": 3.73, + "grad_norm": 1.0118631047717597e-05, + "learning_rate": 4.979359726901638e-09, + "logits/chosen": -2.689054489135742, + "logits/rejected": -2.6053109169006348, + "logps/chosen": -249.2718048095703, + "logps/rejected": -752.0653076171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7038519382476807, + "rewards/margins": 50.081825256347656, + "rewards/rejected": -52.785675048828125, + "step": 525 + }, + { + "epoch": 3.74, + "grad_norm": 0.0005952093546128995, + "learning_rate": 4.6997718535568e-09, + "logits/chosen": -2.576707601547241, + "logits/rejected": -2.5225558280944824, + "logps/chosen": -218.0951690673828, + "logps/rejected": -636.01904296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.561260223388672, + "rewards/margins": 39.70025634765625, + "rewards/rejected": -43.26151657104492, + "step": 526 + }, + { + "epoch": 3.74, + "grad_norm": 0.00032767527772385046, + "learning_rate": 4.4281873178278475e-09, + "logits/chosen": -2.643268585205078, + "logits/rejected": -2.562049388885498, + "logps/chosen": -249.435302734375, + "logps/rejected": -736.1639404296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0804693698883057, + "rewards/margins": 46.055912017822266, + "rewards/rejected": -48.136383056640625, + "step": 527 + }, + { + "epoch": 3.75, + "grad_norm": 7.134858798105368e-07, + "learning_rate": 4.164614980622677e-09, + "logits/chosen": -2.6667962074279785, + "logits/rejected": -2.5475502014160156, + "logps/chosen": -266.4714660644531, + "logps/rejected": -706.01708984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.186729073524475, + "rewards/margins": 38.75198745727539, + "rewards/rejected": -39.938720703125, + "step": 528 + }, + { + "epoch": 3.76, + "grad_norm": 0.000123378688887343, + "learning_rate": 3.909063441437627e-09, + "logits/chosen": -2.638554096221924, + "logits/rejected": -2.546849250793457, + "logps/chosen": -232.7906494140625, + "logps/rejected": -652.026611328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7369204759597778, + "rewards/margins": 38.685546875, + "rewards/rejected": -39.422462463378906, + "step": 529 + }, + { + "epoch": 3.77, + "grad_norm": 0.00010777771752876477, + "learning_rate": 3.661541038076754e-09, + "logits/chosen": -2.6105692386627197, + "logits/rejected": -2.598308563232422, + "logps/chosen": -210.9410400390625, + "logps/rejected": -699.4739379882812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.09077787399292, + "rewards/margins": 42.617923736572266, + "rewards/rejected": -44.708702087402344, + "step": 530 + }, + { + "epoch": 3.77, + "grad_norm": 0.00024413808701914707, + "learning_rate": 3.4220558463799177e-09, + "logits/chosen": -2.6351587772369385, + "logits/rejected": -2.6326394081115723, + "logps/chosen": -242.35427856445312, + "logps/rejected": -732.5922241210938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7585668563842773, + "rewards/margins": 43.73183059692383, + "rewards/rejected": -45.49039840698242, + "step": 531 + }, + { + "epoch": 3.78, + "grad_norm": 2.088248438094659e-05, + "learning_rate": 3.1906156799593185e-09, + "logits/chosen": -2.58453106880188, + "logits/rejected": -2.5550718307495117, + "logps/chosen": -209.361328125, + "logps/rejected": -663.1408081054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.398038387298584, + "rewards/margins": 37.62609100341797, + "rewards/rejected": -41.02412796020508, + "step": 532 + }, + { + "epoch": 3.79, + "grad_norm": 0.00010782961982631775, + "learning_rate": 2.9672280899444013e-09, + "logits/chosen": -2.578336000442505, + "logits/rejected": -2.574387788772583, + "logps/chosen": -220.75234985351562, + "logps/rejected": -832.4039306640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.889570474624634, + "rewards/margins": 53.09165954589844, + "rewards/rejected": -55.981231689453125, + "step": 533 + }, + { + "epoch": 3.79, + "grad_norm": 0.0011012657277378198, + "learning_rate": 2.7519003647356875e-09, + "logits/chosen": -2.6087136268615723, + "logits/rejected": -2.556013822555542, + "logps/chosen": -230.90760803222656, + "logps/rejected": -752.01220703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.903672218322754, + "rewards/margins": 51.96900177001953, + "rewards/rejected": -53.87267303466797, + "step": 534 + }, + { + "epoch": 3.8, + "grad_norm": 0.00045847020454420573, + "learning_rate": 2.5446395297668287e-09, + "logits/chosen": -2.6001996994018555, + "logits/rejected": -2.614643096923828, + "logps/chosen": -206.0194091796875, + "logps/rejected": -727.6276245117188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.560021162033081, + "rewards/margins": 49.66565704345703, + "rewards/rejected": -52.225677490234375, + "step": 535 + }, + { + "epoch": 3.81, + "grad_norm": 3.336203614344873e-05, + "learning_rate": 2.345452347275456e-09, + "logits/chosen": -2.5965325832366943, + "logits/rejected": -2.55116868019104, + "logps/chosen": -200.4416961669922, + "logps/rejected": -645.6135864257812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.620997905731201, + "rewards/margins": 41.373722076416016, + "rewards/rejected": -43.994720458984375, + "step": 536 + }, + { + "epoch": 3.82, + "grad_norm": 0.0006999567971533117, + "learning_rate": 2.1543453160826066e-09, + "logits/chosen": -2.6066911220550537, + "logits/rejected": -2.562065601348877, + "logps/chosen": -227.69476318359375, + "logps/rejected": -679.8926391601562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.799435615539551, + "rewards/margins": 40.542022705078125, + "rewards/rejected": -44.34145736694336, + "step": 537 + }, + { + "epoch": 3.82, + "grad_norm": 0.0005604638901911069, + "learning_rate": 1.9713246713805587e-09, + "logits/chosen": -2.6190500259399414, + "logits/rejected": -2.6056013107299805, + "logps/chosen": -239.04745483398438, + "logps/rejected": -728.4417114257812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.335944890975952, + "rewards/margins": 43.79815673828125, + "rewards/rejected": -46.134098052978516, + "step": 538 + }, + { + "epoch": 3.83, + "grad_norm": 4.7783750743188536e-06, + "learning_rate": 1.7963963845294139e-09, + "logits/chosen": -2.661330223083496, + "logits/rejected": -2.538231372833252, + "logps/chosen": -260.2390441894531, + "logps/rejected": -719.76904296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5885722637176514, + "rewards/margins": 46.47611618041992, + "rewards/rejected": -47.064693450927734, + "step": 539 + }, + { + "epoch": 3.84, + "grad_norm": 0.0006786773137310991, + "learning_rate": 1.6295661628624447e-09, + "logits/chosen": -2.636453628540039, + "logits/rejected": -2.52661395072937, + "logps/chosen": -278.93731689453125, + "logps/rejected": -737.27392578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3505178689956665, + "rewards/margins": 44.48197937011719, + "rewards/rejected": -45.832496643066406, + "step": 540 + }, + { + "epoch": 3.84, + "grad_norm": 0.0006463777932563014, + "learning_rate": 1.470839449499689e-09, + "logits/chosen": -2.5432991981506348, + "logits/rejected": -2.5113589763641357, + "logps/chosen": -189.8641815185547, + "logps/rejected": -760.866943359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.735294818878174, + "rewards/margins": 51.046661376953125, + "rewards/rejected": -53.78195571899414, + "step": 541 + }, + { + "epoch": 3.85, + "grad_norm": 0.0008086731024632337, + "learning_rate": 1.320221423170398e-09, + "logits/chosen": -2.6137325763702393, + "logits/rejected": -2.571281671524048, + "logps/chosen": -201.52828979492188, + "logps/rejected": -667.669921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8557465076446533, + "rewards/margins": 43.3610954284668, + "rewards/rejected": -46.21684265136719, + "step": 542 + }, + { + "epoch": 3.86, + "grad_norm": 7.948441023921063e-05, + "learning_rate": 1.1777169980441703e-09, + "logits/chosen": -2.5721867084503174, + "logits/rejected": -2.5402979850769043, + "logps/chosen": -225.49884033203125, + "logps/rejected": -624.1567993164062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9386448860168457, + "rewards/margins": 36.49800109863281, + "rewards/rejected": -40.4366455078125, + "step": 543 + }, + { + "epoch": 3.87, + "grad_norm": 2.2020637149173167e-06, + "learning_rate": 1.0433308235704985e-09, + "logits/chosen": -2.622354507446289, + "logits/rejected": -2.592653751373291, + "logps/chosen": -204.864990234375, + "logps/rejected": -686.6837158203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1060729026794434, + "rewards/margins": 42.62062072753906, + "rewards/rejected": -45.7266960144043, + "step": 544 + }, + { + "epoch": 3.87, + "grad_norm": 0.0007915969372839803, + "learning_rate": 9.170672843271666e-10, + "logits/chosen": -2.625001907348633, + "logits/rejected": -2.572631359100342, + "logps/chosen": -236.1276397705078, + "logps/rejected": -666.9713134765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8551907539367676, + "rewards/margins": 38.6225700378418, + "rewards/rejected": -42.477760314941406, + "step": 545 + }, + { + "epoch": 3.88, + "grad_norm": 3.270880714896315e-05, + "learning_rate": 7.989304998771151e-10, + "logits/chosen": -2.623558521270752, + "logits/rejected": -2.571274757385254, + "logps/chosen": -193.09713745117188, + "logps/rejected": -663.9268188476562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2196335792541504, + "rewards/margins": 41.74732971191406, + "rewards/rejected": -44.96696090698242, + "step": 546 + }, + { + "epoch": 3.89, + "grad_norm": 0.000151216199223518, + "learning_rate": 6.88924324634077e-10, + "logits/chosen": -2.6475632190704346, + "logits/rejected": -2.5866947174072266, + "logps/chosen": -218.62295532226562, + "logps/rejected": -649.5780029296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.143815279006958, + "rewards/margins": 38.69684600830078, + "rewards/rejected": -39.84066390991211, + "step": 547 + }, + { + "epoch": 3.89, + "grad_norm": 0.0007736294437630317, + "learning_rate": 5.870523477368439e-10, + "logits/chosen": -2.5414252281188965, + "logits/rejected": -2.5908203125, + "logps/chosen": -223.25921630859375, + "logps/rejected": -722.0737915039062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5433340072631836, + "rewards/margins": 41.699283599853516, + "rewards/rejected": -45.24261474609375, + "step": 548 + }, + { + "epoch": 3.9, + "grad_norm": 1.9519572169989503e-05, + "learning_rate": 4.933178929321102e-10, + "logits/chosen": -2.6471638679504395, + "logits/rejected": -2.5520832538604736, + "logps/chosen": -264.80438232421875, + "logps/rejected": -674.8248291015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9683010578155518, + "rewards/margins": 40.30274963378906, + "rewards/rejected": -41.27105712890625, + "step": 549 + }, + { + "epoch": 3.91, + "grad_norm": 2.2893564144748588e-05, + "learning_rate": 4.0772401846608794e-10, + "logits/chosen": -2.62985897064209, + "logits/rejected": -2.5865864753723145, + "logps/chosen": -212.1999053955078, + "logps/rejected": -747.483154296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9118320941925049, + "rewards/margins": 50.968971252441406, + "rewards/rejected": -52.88079833984375, + "step": 550 + }, + { + "epoch": 3.91, + "grad_norm": 0.0006555504801010508, + "learning_rate": 3.3027351698464155e-10, + "logits/chosen": -2.6399097442626953, + "logits/rejected": -2.570702075958252, + "logps/chosen": -245.3954315185547, + "logps/rejected": -759.6217041015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8222752809524536, + "rewards/margins": 48.191375732421875, + "rewards/rejected": -50.01364517211914, + "step": 551 + }, + { + "epoch": 3.92, + "grad_norm": 0.0008416250019261044, + "learning_rate": 2.609689154422778e-10, + "logits/chosen": -2.651149272918701, + "logits/rejected": -2.6588101387023926, + "logps/chosen": -219.4056396484375, + "logps/rejected": -761.507568359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.141348123550415, + "rewards/margins": 46.26686477661133, + "rewards/rejected": -47.40821075439453, + "step": 552 + }, + { + "epoch": 3.93, + "grad_norm": 0.0002409509965469598, + "learning_rate": 1.998124750196284e-10, + "logits/chosen": -2.6847646236419678, + "logits/rejected": -2.627934455871582, + "logps/chosen": -232.39654541015625, + "logps/rejected": -663.7286376953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.612367630004883, + "rewards/margins": 39.28120422363281, + "rewards/rejected": -41.89356994628906, + "step": 553 + }, + { + "epoch": 3.94, + "grad_norm": 0.0006596279634716474, + "learning_rate": 1.468061910496754e-10, + "logits/chosen": -2.642146110534668, + "logits/rejected": -2.596142053604126, + "logps/chosen": -205.49057006835938, + "logps/rejected": -748.7030029296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5633480548858643, + "rewards/margins": 49.03366470336914, + "rewards/rejected": -52.597007751464844, + "step": 554 + }, + { + "epoch": 3.94, + "grad_norm": 7.327209994068125e-05, + "learning_rate": 1.0195179295269252e-10, + "logits/chosen": -2.693852186203003, + "logits/rejected": -2.555957794189453, + "logps/chosen": -278.0377197265625, + "logps/rejected": -723.62841796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43430817127227783, + "rewards/margins": 44.582855224609375, + "rewards/rejected": -45.01716613769531, + "step": 555 + }, + { + "epoch": 3.95, + "grad_norm": 0.0006580150051386396, + "learning_rate": 6.525074417979004e-11, + "logits/chosen": -2.626962900161743, + "logits/rejected": -2.5788369178771973, + "logps/chosen": -278.2981262207031, + "logps/rejected": -692.57568359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.301385879516602, + "rewards/margins": 37.807674407958984, + "rewards/rejected": -42.10906219482422, + "step": 556 + }, + { + "epoch": 3.96, + "grad_norm": 0.0003003106632929461, + "learning_rate": 3.670424216520307e-11, + "logits/chosen": -2.632838726043701, + "logits/rejected": -2.587643623352051, + "logps/chosen": -272.41119384765625, + "logps/rejected": -755.6422119140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.258763313293457, + "rewards/margins": 43.71070098876953, + "rewards/rejected": -47.96946334838867, + "step": 557 + }, + { + "epoch": 3.96, + "grad_norm": 0.0003243619290241665, + "learning_rate": 1.6313218287128394e-11, + "logits/chosen": -2.5734102725982666, + "logits/rejected": -2.5828938484191895, + "logps/chosen": -197.8962860107422, + "logps/rejected": -674.5167236328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.271682262420654, + "rewards/margins": 42.28971862792969, + "rewards/rejected": -46.5614013671875, + "step": 558 + }, + { + "epoch": 3.97, + "grad_norm": 0.00022653861509093804, + "learning_rate": 4.078337837470913e-12, + "logits/chosen": -2.645965337753296, + "logits/rejected": -2.614337205886841, + "logps/chosen": -194.5188751220703, + "logps/rejected": -715.8605346679688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8231751918792725, + "rewards/margins": 45.57339859008789, + "rewards/rejected": -47.396568298339844, + "step": 559 + }, + { + "epoch": 3.98, + "grad_norm": 0.00017868578829305034, + "learning_rate": 0.0, + "logits/chosen": -2.614107131958008, + "logits/rejected": -2.5191268920898438, + "logps/chosen": -279.91717529296875, + "logps/rejected": -695.2500610351562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8186190128326416, + "rewards/margins": 43.27680206298828, + "rewards/rejected": -46.09542465209961, + "step": 560 + } + ], + "logging_steps": 1.0, + "max_steps": 560, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 500, + "total_flos": 36777767731200.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}